Skip to content

Commit

Permalink
Add heartbeat alert for mimir (#1094)
Browse files Browse the repository at this point in the history
* Add heartbeat alert for mimir

* Add mimir.enabled flag
  • Loading branch information
QuentinBisson authored Apr 2, 2024
1 parent 177d0e3 commit ea2f7c1
Show file tree
Hide file tree
Showing 7 changed files with 70 additions and 4 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

### Added

- Add Heartbeat alert for mimir.
- Add missing alert about loki containers not running to ensure we do not suffer from [extra cloud cost](https://github.com/giantswarm/giantswarm/issues/30124).
- Add missing alert about mimir containers not running to ensure we do not suffer from [extra cloud cost](https://github.com/giantswarm/giantswarm/issues/30124).
- Add recording rule for ingresses using the baseDomain.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ spec:
topic: kubernetes
annotations:
description: '{{`Kubelet ({{ $labels.instance }}) is down.`}}'
# TODO: fix with real expr
# TODO(@team-turtles): fix with real expr
- alert: ScrapeTimeout
annotations:
description: '{{`Never fires (dummy alert).`}}'
Expand All @@ -38,7 +38,7 @@ spec:
scrape_timeout: "true"
team: phoenix
topic: monitoring
# TODO: fix with real expr
# TODO(@team-turtles): fix with real expr
- alert: ApiServerDown
annotations:
description: '{{`Never fires (dummy alert).`}}'
Expand Down
14 changes: 14 additions & 0 deletions helm/prometheus-rules/templates/alerting-rules/mimir.rules.yml
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
{{- if .Values.mimir.enabled }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
Expand All @@ -9,6 +10,18 @@ spec:
groups:
- name: mimir
rules:
# Heartbeat for the Mimir-based alerting pipeline.
# It fires for every `up{app="mimir"}` series whose value is above 0, i.e. it is meant to
# fire continuously so that the entire alerting pipeline can be checked end to end.
- alert: "Heartbeat"
  annotations:
    description: This alert is used to ensure the entire alerting pipeline is functional.
  expr: up{app="mimir"} > 0
  labels:
    area: "empowerment"
    # Quoted so the rendered label is always a YAML string, even if the cluster name is
    # empty or looks like a number or a boolean.
    installation: {{ .Values.managementCluster.name | quote }}
    # TODO(@team-atlas): We need this label as long as we have the old and new heartbeats. Let's remove once the legacy monitoring is gone
    type: "mimir-heartbeat"
    team: "atlas"
    topic: "observability"
# Coming from https://github.com/giantswarm/giantswarm/issues/30124
# This alert ensures Mimir containers are not restarting too often (flappiness).
# If that is not the case, this can incur high costs by cloud providers (s3 api calls are quite expensive).
Expand Down Expand Up @@ -71,3 +84,4 @@ spec:
severity: page
team: atlas
topic: observability
{{- end }}
Original file line number Diff line number Diff line change
Expand Up @@ -383,7 +383,7 @@ spec:
# Dex operator metrics for expiry time of identity provider oauth app secrets
- expr: dex_operator_idp_secret_expiry_time
record: aggregation:dex_operator_idp_secret_expiry_time
# Requests to the deprecated k8s authenticator. TODO: Get rid of this recording rule when the component is no longer used.
# Requests to the deprecated k8s authenticator. TODO(@team-bigmac): Get rid of this recording rule when the component is no longer used.
- expr: nginx_ingress_controller_requests{ingress="dex-k8s-authenticator"}
record: aggregation:dex_k8s_authenticator_requests
- name: grafana.grafana-cloud.recording
Expand Down
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
{{- if .Values.mimir.enabled }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
Expand Down Expand Up @@ -577,3 +578,4 @@ spec:
- expr: |
sum by(cluster, namespace, pod) (rate(cortex_ingester_ingested_samples_total[1m]))
record: cluster_namespace_pod:cortex_ingester_ingested_samples_total:rate1m
{{- end }}
2 changes: 1 addition & 1 deletion test/hack/bin/check-opsrecipes.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
set -euo pipefail

# List of generated rules
RULES_FILES=(./test/hack/output/*/prometheus-rules/templates/alerting-rules/*)
RULES_FILES=(./test/hack/output/*/*/prometheus-rules/templates/alerting-rules/*)
#RULES_FILES=(./test/hack/output/*/prometheus-rules/templates/alerting-rules/up*)

DEBUG_MODE=false
Expand Down
49 changes: 49 additions & 0 deletions test/tests/providers/global/mimir.rules.test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,55 @@ rule_files:
- mimir.rules.yml

tests:
# Heartbeat: must fire while the Mimir target reports up == 1 and must stay silent
# while the series is absent or reports 0.
- interval: 1m
  input_series:
    # One target, in consecutive phases: up for the first hour, then absent, up, down
    # and up again for roughly 30 minutes each.
    - series: 'up{app="mimir"}'
      values: "1+0x60 _x30 1+0x30 0+0x30 1+0x30"
  alert_rule_test:
    # Phase 1 (up): the heartbeat fires.
    # exp_labels must be exactly the labels of the input series plus the labels set by
    # the rule, so `type` has to be the rule's "mimir-heartbeat" value.
    - alertname: Heartbeat
      eval_time: 20m
      exp_alerts:
        - exp_labels:
            app: mimir
            area: empowerment
            installation: myinstall
            team: atlas
            topic: observability
            type: mimir-heartbeat
          exp_annotations:
            description: "This alert is used to ensure the entire alerting pipeline is functional."
    # Phase 2 (series absent): no alert expected.
    - alertname: Heartbeat
      eval_time: 70m
    # Phase 3 (up again): the heartbeat fires.
    - alertname: Heartbeat
      eval_time: 95m
      exp_alerts:
        - exp_labels:
            app: mimir
            area: empowerment
            installation: myinstall
            team: atlas
            topic: observability
            type: mimir-heartbeat
          exp_annotations:
            description: "This alert is used to ensure the entire alerting pipeline is functional."
    # Phase 4 (down, up == 0): no alert expected.
    - alertname: Heartbeat
      eval_time: 140m
    # Phase 5 (up again): the heartbeat fires.
    - alertname: Heartbeat
      eval_time: 165m
      exp_alerts:
        - exp_labels:
            app: mimir
            area: empowerment
            installation: myinstall
            team: atlas
            topic: observability
            type: mimir-heartbeat
          exp_annotations:
            description: "This alert is used to ensure the entire alerting pipeline is functional."
- interval: 1m
input_series:
# For the first 60min: test with 1 pod: none, up, down
Expand Down

0 comments on commit ea2f7c1

Please sign in to comment.