Merge branch 'master' into add-mimir-heartbeart-alert
QuentinBisson authored Mar 28, 2024
2 parents 897e3a6 + 7c98e67 commit 31301ab
Showing 6 changed files with 102 additions and 10 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -10,6 +10,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Added

- Add heartbeat alert for Mimir.
- Add missing alert about loki containers not running to ensure we do not suffer from [extra cloud cost](https://github.com/giantswarm/giantswarm/issues/30124).
- Add missing alert about mimir containers not running to ensure we do not suffer from [extra cloud cost](https://github.com/giantswarm/giantswarm/issues/30124).

## [3.5.0] - 2024-03-27

4 changes: 4 additions & 0 deletions README.md
@@ -60,6 +60,10 @@ Any Alert includes:
- `cancel_if_.*`


### Specific alert labels

- `all_pipelines: true`: adding this label to an alert ensures it is sent to Opsgenie even if the installation is not a stable installation.

#### Routing

Alertmanager does the routing based on the labels mentioned above.
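For illustration only, here is a minimal sketch of an Alertmanager route that would honor the `all_pipelines` label described above; the receiver names and the `pipeline` matcher are assumptions, not this repository's actual configuration:

```yaml
# Hypothetical routing sketch: alerts carrying all_pipelines="true" reach Opsgenie
# even on non-stable installations; everything else only pages on stable pipelines.
route:
  receiver: blackhole            # assumed default receiver for non-stable installations
  routes:
    - matchers:
        - all_pipelines="true"   # label set on the alert rule itself
      receiver: opsgenie         # assumed Opsgenie receiver name
    - matchers:
        - pipeline="stable"      # hypothetical label identifying stable installations
      receiver: opsgenie
```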
@@ -3,12 +3,32 @@ kind: PrometheusRule
metadata:
labels:
{{- include "labels.common" . | nindent 4 }}
name: grafana.all.rules
name: loki.rules
namespace: {{ .Values.namespace }}
spec:
groups:
- name: loki
rules:
# Coming from https://github.com/giantswarm/giantswarm/issues/30124
# This alert ensures Loki containers are not restarting too often (flappiness).
# If that is not the case, it can incur high costs from cloud providers (S3 API calls are quite expensive).
- alert: LokiRestartingTooOften
annotations:
description: '{{`Loki containers are restarting too often.`}}'
opsrecipe: loki/
expr: |
increase(
kube_pod_container_status_restarts_total{cluster_type="management_cluster", namespace="loki"}[1h]
) > 5
for: 5m
labels:
area: managedservices
# This label is used to ensure the alert goes through even for non-stable installations
all_pipelines: "true"
cancel_if_outside_working_hours: "true"
severity: page
team: atlas
topic: observability
# Rules inspired from loki-mixins - https://github.com/grafana/loki/blob/main/production/loki-mixin-compiled/alerts.yaml
- alert: LokiRequestErrors
annotations:
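Purely as an illustration (not part of this change), the same metric can be queried ad hoc to see which Loki workloads would currently trip the threshold used by `LokiRestartingTooOften`; the selectors are copied from the rule above:

```promql
# Loki containers with more than 5 restarts over the last hour, grouped by pod and container
sum by (pod, container) (
  increase(kube_pod_container_status_restarts_total{cluster_type="management_cluster", namespace="loki"}[1h])
) > 5
```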
24 changes: 20 additions & 4 deletions helm/prometheus-rules/templates/alerting-rules/mimir.rules.yml
@@ -24,18 +24,36 @@ spec:
type: "heartbeat"
# TODO(@team-atlas): remove once we use mimir alertmanager
namespace: "monitoring" # Needed due to https://github.com/prometheus-operator/prometheus-operator/issues/3737
# Coming from https://github.com/giantswarm/giantswarm/issues/30124
# This alert ensures Mimir containers are not restarting too often (flappiness).
# If that is not the case, it can incur high costs from cloud providers (S3 API calls are quite expensive).
# This alert will not page for the prometheus-buddy.
- alert: MimirRestartingTooOften
annotations:
description: '{{`Mimir containers are restarting too often.`}}'
expr: |
increase(
kube_pod_container_status_restarts_total{cluster_type="management_cluster", namespace="mimir", container!="prometheus"}[1h]
) > 5
for: 5m
labels:
area: managedservices
# This label is used to ensure the alert goes through even for non-stable installations
all_pipelines: "true"
cancel_if_outside_working_hours: "true"
severity: page
team: atlas
topic: observability
- alert: MimirComponentDown
annotations:
description: '{{`Mimir component : {{ $labels.service }} is down.`}}'
expr: count(up{app="mimir"} == 0) by (cluster_id, service) > 0
for: 5m
labels:
area: managedservices
cancel_if_apiserver_down: "true"
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
cancel_if_cluster_status_updating: "true"
cancel_if_scrape_timeout: "true"
cancel_if_outside_working_hours: "true"
severity: page
team: atlas
@@ -47,7 +65,6 @@ spec:
for: 1h
labels:
area: managedservices
cancel_if_apiserver_down: "true"
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
cancel_if_cluster_status_updating: "true"
@@ -62,7 +79,6 @@ spec:
for: 1h
labels:
area: managedservices
cancel_if_apiserver_down: "true"
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
cancel_if_cluster_status_updating: "true"
@@ -1,6 +1,6 @@
---
rule_files:
- loki.all.rules.yml
- loki.rules.yml

tests:
- interval: 1m
@@ -98,3 +98,29 @@ tests:
exp_annotations:
description: "Loki pod loki-compactor-676b8c897b-rq298 (namespace loki) sees 1 unhealthy ring members"
opsrecipe: "loki/"
- interval: 1m
input_series:
- series: 'kube_pod_container_status_restarts_total{cluster_type="management_cluster", namespace="loki"}'
values: "0+0x20 0+5x20 100+0x140" # 0 restarts after 20 minutes then we restart 5 times per minute for 20 minutes then we stop restarting for 140 minutes
alert_rule_test:
- alertname: LokiRestartingTooOften
eval_time: 15m # should be OK after 15 minutes
exp_alerts:
- alertname: LokiRestartingTooOften
eval_time: 85m # After 85 minutes, the alert should be firing
exp_alerts:
- exp_labels:
all_pipelines: true
area: managedservices
cancel_if_outside_working_hours: "true"
cluster_type: management_cluster
namespace: loki
severity: page
team: atlas
topic: observability
exp_annotations:
description: Loki containers are restarting too often.
opsrecipe: loki/
- alertname: LokiRestartingTooOften
eval_time: 140m # After 140 minutes, everything should be back to normal
exp_alerts:
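A note for readers unfamiliar with promtool's expanding notation (these unit tests are run with `promtool test rules <test-file>`): a value of the form `a+bxn` expands to n+1 samples starting at `a` and growing by `b` per step, so the input series above works out roughly as follows:

```yaml
# "0+0x20"    -> 0, 0, ..., 0         no restarts for the first 20 minutes
# "0+5x20"    -> 0, 5, 10, ..., 100   5 new restarts per minute for 20 minutes
# "100+0x140" -> 100, 100, ..., 100   no further restarts for 140 minutes
```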
32 changes: 28 additions & 4 deletions test/tests/providers/global/mimir.rules.test.yml
@@ -71,11 +71,9 @@ tests:
severity: page
team: atlas
topic: observability
cancel_if_apiserver_down: "true"
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
cancel_if_cluster_status_updating: "true"
cancel_if_scrape_timeout: "true"
cancel_if_outside_working_hours: "true"
cluster_id: gauss
exp_annotations:
@@ -95,7 +93,6 @@ tests:
exp_alerts:
- exp_labels:
area: managedservices
cancel_if_apiserver_down: "true"
cancel_if_outside_working_hours: "true"
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
@@ -119,7 +116,6 @@ tests:
exp_alerts:
- exp_labels:
area: managedservices
cancel_if_apiserver_down: "true"
cancel_if_outside_working_hours: "true"
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
@@ -135,3 +131,31 @@ tests:
description: "Mimir ruler is failing to process PrometheusRules."
- alertname: MimirRulerEventsFailed
eval_time: 160m
- interval: 1m
input_series:
- series: 'kube_pod_container_status_restarts_total{cluster_type="management_cluster", namespace="mimir", container="mimir-ingester"}'
values: "0+0x20 0+5x20 100+0x140" # 0 restarts after 20 minutes then we restart 5 times per minute for 20 minutes then we stop restarting for 140 minutes
- series: 'kube_pod_container_status_restarts_total{cluster_type="management_cluster", namespace="mimir", container="prometheus"}'
values: "0+5x180" # prometheus container restarts 5 times per minute for 180 minutes
alert_rule_test:
- alertname: MimirRestartingTooOften
eval_time: 15m # should be OK after 15 minutes
exp_alerts:
- alertname: MimirRestartingTooOften
eval_time: 85m # After 85 minutes, the alert should be firing
exp_alerts:
- exp_labels:
all_pipelines: true
area: managedservices
cancel_if_outside_working_hours: "true"
cluster_type: management_cluster
container: mimir-ingester
namespace: mimir
severity: page
team: atlas
topic: observability
exp_annotations:
description: Mimir containers are restarting too often.
- alertname: MimirRestartingTooOften
eval_time: 140m # After 140 minutes, everything should be back to normal
exp_alerts:
