From a2c0d1d9f219cb97ee932df23568ae0897ac5d19 Mon Sep 17 00:00:00 2001
From: rakeshgm
Date: Wed, 26 Jul 2023 20:19:59 +0530
Subject: [PATCH] package ramen alerts VolumeSynchronizationDelay

Signed-off-by: rakeshgm
---
Review notes (this section is not part of the commit message):
- Restored the patch's line structure and rejoined the warning alert's
  description, which had been broken across two lines.
- Added explanatory comments to alerts.yaml; hunk header and diffstat
  updated accordingly (32 -> 36 added lines).
- kustomization.yaml now ends with a trailing newline.
- Blob ids on the "index" lines are those of the original commit and do
  not reflect the changes above.

 config/prometheus/alerts.yaml        | 36 ++++++++++++++++++++++++++++
 config/prometheus/kustomization.yaml |  1 +
 2 files changed, 37 insertions(+)
 create mode 100644 config/prometheus/alerts.yaml

diff --git a/config/prometheus/alerts.yaml b/config/prometheus/alerts.yaml
new file mode 100644
index 0000000000..ccce73cd3d
--- /dev/null
+++ b/config/prometheus/alerts.yaml
@@ -0,0 +1,36 @@
+# Recording rule and alerts for Ramen volume synchronization delay.
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  labels:
+    prometheus: k8s
+    role: alert-rules
+  name: alerting-rules
+  namespace: system
+spec:
+  groups:
+    - name: alerts
+      rules:
+        # Sync duration divided by the policy schedule interval, matched on policyname and job.
+        - record: ramen_rpo_difference
+          expr: ramen_sync_duration_seconds{job="ramen-hub-operator-metrics-service"} / on(policyname, job) group_left() (ramen_policy_schedule_interval_seconds{job="ramen-hub-operator-metrics-service"})
+        # Critical: the recorded ratio has been at least 3 for 5s.
+        - alert: VolumeSynchronizationDelay
+          expr: ramen_rpo_difference >= 3
+          for: 5s
+          labels:
+            cluster: "{{ $labels.cluster }}"
+            severity: critical
+          annotations:
+            description: "Syncing of volumes (DRPC: {{ $labels.obj_name }}, Namespace: {{ $labels.obj_namespace }}) is taking more than thrice the scheduled snapshot interval. This may cause data loss and a backlog of replication requests. To get around the delay, follow the instructions provided in the documentation."
+            alert_type: "DisasterRecovery"
+        # Warning: the recorded ratio has been strictly between 2 and 3 for 5s.
+        - alert: VolumeSynchronizationDelay
+          expr: ramen_rpo_difference > 2 and ramen_rpo_difference < 3
+          for: 5s
+          labels:
+            cluster: "{{ $labels.cluster }}"
+            severity: warning
+          annotations:
+            description: "Syncing of volumes (DRPC: {{ $labels.obj_name }}, Namespace: {{ $labels.obj_namespace }}) is taking more than twice the scheduled snapshot interval. This may cause data loss and impact replication requests. Check the documentation for instructions on how to get around the delay."
+            alert_type: "DisasterRecovery"
diff --git a/config/prometheus/kustomization.yaml b/config/prometheus/kustomization.yaml
index ed137168a1..cf99431803 100644
--- a/config/prometheus/kustomization.yaml
+++ b/config/prometheus/kustomization.yaml
@@ -1,2 +1,3 @@
 resources:
 - monitor.yaml
+- alerts.yaml