From 9cf0d30c8910d116859906475ca84ef9b3321fa4 Mon Sep 17 00:00:00 2001
From: Quentin Bisson
Date: Tue, 7 Nov 2023 18:18:29 +0100
Subject: [PATCH] Add keda alerting rules (#942)

* Add keda alerting rules

Signed-off-by: QuentinBisson

* move to atlas and make alerts notify only

---------

Signed-off-by: QuentinBisson
---
 CHANGELOG.md                                  |  4 ++
 .../templates/alerting-rules/keda.rules.yml   | 82 +++++++++++++++++++
 2 files changed, 86 insertions(+)
 create mode 100644 helm/prometheus-rules/templates/alerting-rules/keda.rules.yml

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 9248f56d6..463467c46 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+### Added
+
+- Add KEDA alerting rules.
+
 ### Changed
 
 - Added `namespace` label to Flux helm release related alerts
diff --git a/helm/prometheus-rules/templates/alerting-rules/keda.rules.yml b/helm/prometheus-rules/templates/alerting-rules/keda.rules.yml
new file mode 100644
index 000000000..abd0b8880
--- /dev/null
+++ b/helm/prometheus-rules/templates/alerting-rules/keda.rules.yml
@@ -0,0 +1,82 @@
+# KEDA alerting rules. Every alert below has severity "notify" and is owned by
+# team atlas. The cancel_if_* labels are presumably consumed by Alertmanager
+# inhibition rules defined elsewhere - confirm before renaming any of them.
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  creationTimestamp: null
+  labels:
+    {{- include "labels.common" . | nindent 4 }}
+  name: keda.rules
+  # Quoted so a value that looks like a number or boolean still renders as a string.
+  namespace: {{ .Values.namespace | quote }}
+spec:
+  groups:
+  - name: Keda
+    rules:
+    # Fires once the number of targets with up == 0 (container label matching
+    # keda-.*) has been above zero for 10 minutes. NOTE(review): count() without a
+    # by/without clause drops all target labels - confirm that suffices for routing.
+    - alert: KedaDown
+      annotations:
+        description: 'Keda is down.'
+      expr: count(up{container=~"keda-.*"} == 0) > 0
+      for: 10m
+      labels:
+        area: kaas
+        cancel_if_apiserver_down: "true"
+        cancel_if_cluster_status_creating: "true"
+        cancel_if_cluster_status_deleting: "true"
+        cancel_if_cluster_status_updating: "true"
+        cancel_if_outside_working_hours: "true"
+        severity: notify
+        team: atlas
+        topic: autoscaling
+    # Fires when the 10-minute increase of keda_scaled_object_errors stays above zero for 15 minutes.
+    - alert: KedaScaledObjectErrors
+      annotations:
+        description: '{{`Errors detected in scaled object {{ $labels.scaledObject }} in namespace {{ $labels.namespace}}.`}}'
+      expr: increase(keda_scaled_object_errors[10m]) > 0
+      for: 15m
+      labels:
+        area: kaas
+        cancel_if_apiserver_down: "true"
+        cancel_if_cluster_status_creating: "true"
+        cancel_if_cluster_status_deleting: "true"
+        cancel_if_cluster_status_updating: "true"
+        cancel_if_outside_working_hours: "true"
+        severity: notify
+        team: atlas
+        topic: autoscaling
+    # Fires when the 10-minute increase of keda_webhook_scaled_object_validation_errors stays above zero for 15 minutes.
+    - alert: KedaWebhookScaledObjectValidationErrors
+      annotations:
+        description: '{{`Validation errors detected in webhook for scaled object {{ $labels.scaledObject }} in namespace {{ $labels.namespace}}.`}}'
+      expr: increase(keda_webhook_scaled_object_validation_errors[10m]) > 0
+      for: 15m
+      labels:
+        area: kaas
+        cancel_if_apiserver_down: "true"
+        cancel_if_cluster_status_creating: "true"
+        cancel_if_cluster_status_deleting: "true"
+        cancel_if_cluster_status_updating: "true"
+        cancel_if_outside_working_hours: "true"
+        severity: notify
+        team: atlas
+        topic: autoscaling
+    # Fires when the 10-minute increase of keda_scaler_errors stays above zero for 15 minutes.
+    - alert: KedaScalerErrors
+      annotations:
+        description: '{{`Errors detected in scaler {{ $labels.scaler }} for scaled object {{ $labels.scaledObject }} in namespace {{ $labels.namespace}}.`}}'
+      expr: increase(keda_scaler_errors[10m]) > 0
+      for: 15m
+      labels:
+        area: kaas
+        cancel_if_apiserver_down: "true"
+        cancel_if_cluster_status_creating: "true"
+        cancel_if_cluster_status_deleting: "true"
+        cancel_if_cluster_status_updating: "true"
+        cancel_if_outside_working_hours: "true"
+        severity: notify
+        team: atlas
+        topic: autoscaling