From ea2f7c1403733dff5925227f4ba80983c503a84e Mon Sep 17 00:00:00 2001 From: Quentin Bisson Date: Tue, 2 Apr 2024 11:42:12 +0200 Subject: [PATCH] Add heartbeat alert for mimir (#1094) * Add heartbeat alert for mimir * Add mimir.enabled flag --- CHANGELOG.md | 1 + .../alerting-rules/inhibit.all.rules.yml | 4 +- .../templates/alerting-rules/mimir.rules.yml | 14 ++++++ .../recording-rules/grafana-cloud.rules.yml | 2 +- .../recording-rules/mimir-mixins.rules.yml | 2 + test/hack/bin/check-opsrecipes.sh | 2 +- .../providers/global/mimir.rules.test.yml | 49 +++++++++++++++++++ 7 files changed, 70 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 92e66a423..2af5ec125 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added +- Add Heartbeat alert for mimir. - Add missing alert about loki containers not running to ensure we do not suffer from [extra cloud cost](https://github.com/giantswarm/giantswarm/issues/30124). - Add missing alert about mimir containers not running to ensure we do not suffer from [extra cloud cost](https://github.com/giantswarm/giantswarm/issues/30124). - Add recording rule for ingresses using the baseDomain. 
diff --git a/helm/prometheus-rules/templates/alerting-rules/inhibit.all.rules.yml b/helm/prometheus-rules/templates/alerting-rules/inhibit.all.rules.yml index 318ae2c44..6d26e61e8 100644 --- a/helm/prometheus-rules/templates/alerting-rules/inhibit.all.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/inhibit.all.rules.yml @@ -28,7 +28,7 @@ spec: topic: kubernetes annotations: description: '{{`Kubelet ({{ $labels.instance }}) is down.`}}' - # TODO: fix with real expr + # TODO(@team-turtles): fix with real expr - alert: ScrapeTimeout annotations: description: '{{`Never fires (dummy alert).`}}' @@ -38,7 +38,7 @@ spec: scrape_timeout: "true" team: phoenix topic: monitoring - # TODO: fix with real expr + # TODO(@team-turtles): fix with real expr - alert: ApiServerDown annotations: description: '{{`Never fires (dummy alert).`}}' diff --git a/helm/prometheus-rules/templates/alerting-rules/mimir.rules.yml b/helm/prometheus-rules/templates/alerting-rules/mimir.rules.yml index 9b1575dd1..15d5b7198 100644 --- a/helm/prometheus-rules/templates/alerting-rules/mimir.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/mimir.rules.yml @@ -1,3 +1,4 @@ +{{- if .Values.mimir.enabled }} apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule metadata: @@ -9,6 +10,18 @@ spec: groups: - name: mimir rules: + # This alert is meant to always fire, to ensure the entire alerting pipeline is functional. + - alert: "Heartbeat" + annotations: + description: This alert is used to ensure the entire alerting pipeline is functional. + expr: up{app="mimir"} > 0 + labels: + area: "empowerment" + installation: {{ .Values.managementCluster.name }} + # TODO(@team-atlas): We need this label as long as we have the old and new heartbeats. 
Let's remove once the legacy monitoring is gone + type: "mimir-heartbeat" + team: "atlas" + topic: "observability" # Coming from https://github.com/giantswarm/giantswarm/issues/30124 # This alert ensures Mimir containers are not restarting too often (flappiness). # If it is not the the case, this can incur high costs by cloud providers (s3 api calls are quite expensive). @@ -71,3 +84,4 @@ spec: severity: page team: atlas topic: observability +{{- end }} diff --git a/helm/prometheus-rules/templates/recording-rules/grafana-cloud.rules.yml b/helm/prometheus-rules/templates/recording-rules/grafana-cloud.rules.yml index 0d521c096..a192c2ad7 100644 --- a/helm/prometheus-rules/templates/recording-rules/grafana-cloud.rules.yml +++ b/helm/prometheus-rules/templates/recording-rules/grafana-cloud.rules.yml @@ -383,7 +383,7 @@ spec: # Dex operator metrics for expiry time of identity provider oauth app secrets - expr: dex_operator_idp_secret_expiry_time record: aggregation:dex_operator_idp_secret_expiry_time - # Requests to the deprecated k8s authenticator. TODO: Get rid of this recording rule when the component is no longer used. + # Requests to the deprecated k8s authenticator. TODO(@team-bigmac): Get rid of this recording rule when the component is no longer used. 
- expr: nginx_ingress_controller_requests{ingress="dex-k8s-authenticator"} record: aggregation:dex_k8s_authenticator_requests - name: grafana.grafana-cloud.recording diff --git a/helm/prometheus-rules/templates/recording-rules/mimir-mixins.rules.yml b/helm/prometheus-rules/templates/recording-rules/mimir-mixins.rules.yml index e616271dc..b84038a46 100644 --- a/helm/prometheus-rules/templates/recording-rules/mimir-mixins.rules.yml +++ b/helm/prometheus-rules/templates/recording-rules/mimir-mixins.rules.yml @@ -1,3 +1,4 @@ +{{- if .Values.mimir.enabled }} apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule metadata: @@ -577,3 +578,4 @@ spec: - expr: | sum by(cluster, namespace, pod) (rate(cortex_ingester_ingested_samples_total[1m])) record: cluster_namespace_pod:cortex_ingester_ingested_samples_total:rate1m +{{- end }} diff --git a/test/hack/bin/check-opsrecipes.sh b/test/hack/bin/check-opsrecipes.sh index 1b72d1de7..af7384688 100755 --- a/test/hack/bin/check-opsrecipes.sh +++ b/test/hack/bin/check-opsrecipes.sh @@ -2,7 +2,7 @@ set -euo pipefail # List of generated rules -RULES_FILES=(./test/hack/output/*/prometheus-rules/templates/alerting-rules/*) +RULES_FILES=(./test/hack/output/*/*/prometheus-rules/templates/alerting-rules/*) #RULES_FILES=(./test/hack/output/*/prometheus-rules/templates/alerting-rules/up*) DEBUG_MODE=false diff --git a/test/tests/providers/global/mimir.rules.test.yml b/test/tests/providers/global/mimir.rules.test.yml index a5799b3a8..d67e708c4 100644 --- a/test/tests/providers/global/mimir.rules.test.yml +++ b/test/tests/providers/global/mimir.rules.test.yml @@ -3,6 +3,55 @@ rule_files: - mimir.rules.yml tests: + - interval: 1m + input_series: + # For the first 60min: test with 1 pod: up, none, up, down, up + - series: 'up{app="mimir", namespace="monitoring"}' + values: "1+0x60 _x30 1+0x30 0+0x30 1+0x30" + alert_rule_test: + - alertname: Heartbeat + eval_time: 20m + exp_alerts: + - exp_labels: + app: mimir + area: empowerment + installation: myinstall + namespace: 
monitoring + team: atlas + topic: observability + type: mimir-heartbeat + exp_annotations: + description: "This alert is used to ensure the entire alerting pipeline is functional." + - alertname: Heartbeat + eval_time: 70m + - alertname: Heartbeat + eval_time: 95m + exp_alerts: + - exp_labels: + app: mimir + area: empowerment + installation: myinstall + namespace: monitoring + team: atlas + topic: observability + type: mimir-heartbeat + exp_annotations: + description: "This alert is used to ensure the entire alerting pipeline is functional." + - alertname: Heartbeat + eval_time: 140m + - alertname: Heartbeat + eval_time: 165m + exp_alerts: + - exp_labels: + app: mimir + area: empowerment + installation: myinstall + namespace: monitoring + team: atlas + topic: observability + type: mimir-heartbeat + exp_annotations: + description: "This alert is used to ensure the entire alerting pipeline is functional." - interval: 1m input_series: # For the first 60min: test with 1 pod: none, up, down