ROX-25003: Add emailsender alerts (#272)
* Add emailsender alerts

* Add rate to throttled metric
kurlov authored Jul 22, 2024
1 parent 67f7f4f commit 9ebf21f
Showing 6 changed files with 193 additions and 0 deletions.
53 changes: 53 additions & 0 deletions resources/prometheus/prometheus-rules.yaml
@@ -159,6 +159,59 @@ spec:
description: "Fleetshard synchronizer manages `{{ $value }}` centrals. The number of Centrals should always be larger than zero in a working system. If it drops to or below zero, fleetshard synchronizer is assumed to be in a failed state."
sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-007-fleetshard-sync-reconciliation-error.md"

- name: rhacs-emailsender
rules:
- alert: RHACSEmailsenderScrapeFailed
expr: |
(avg_over_time(up{pod=~"emailsender-.*"}[10m]) < 0.5 and ON(pod) kube_pod_container_status_ready{pod=~"emailsender-.*"} == 1) or absent(up{pod=~"emailsender-.*"})
for: 20m
labels:
severity: warning
annotations:
summary: "Prometheus unable to scrape metrics from target `{{ $labels.pod }}` in namespace `{{ $labels.namespace }}`."
description: "During the last 10 minutes, only `{{ $value | humanizePercentage }}` of scrapes of target `{{ $labels.pod }}` in namespace `{{ $labels.namespace }}` were successful. This alert is raised when less than 50% of scrapes are successful."
sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-046-emailsender-unavailable.md"
- alert: RHACSEmailsenderContainerDown
expr: |
avg_over_time(kube_pod_container_status_ready{pod=~"emailsender-.*"}[10m]) < 0.5
for: 20m
labels:
severity: warning
annotations:
summary: "Email Sender container `{{ $labels.pod }}/{{ $labels.container }}` in namespace `{{ $labels.namespace }}` is down or in a CrashLoopBackOff status."
description: "Email Sender container `{{ $labels.pod }}/{{ $labels.container }}` in namespace `{{ $labels.namespace }}` has been down or in a CrashLoopBackOff status for at least 10 minutes."
sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-046-emailsender-unavailable.md"
- alert: RHACSEmailsenderContainerFrequentlyRestarting
expr: increase(kube_pod_container_status_restarts_total{pod=~"emailsender-.*"}[30m]) > 3
labels:
severity: warning
annotations:
summary: "Email Sender container `{{ $labels.pod }}/{{ $labels.container }}` in namespace `{{ $labels.namespace }}` restarted more than 3 times."
description: "Email Sender container `{{ $labels.pod }}/{{ $labels.container }}` in namespace `{{ $labels.namespace }}` has restarted more than 3 times during the last 30 minutes."
sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-046-emailsender-unavailable.md"
- alert: RHACSEmailsenderSendErrors
expr: |2
(rate(acs_emailsender_failed_send_email_total[10m])
/
rate(acs_emailsender_send_email_total[10m])) > 0.10
for: 5m
labels:
severity: warning
annotations:
summary: "Email Sender container failing sending emails"
description: "Email Sender has a send email error rate of {{ $value | humanizePercentage }} over the last 10 minutes."
sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-047-emailsender-ses-send-error.md"
- alert: RHACSEmailsenderThrottledSend
expr: |
rate(acs_emailsender_throttled_send_email_total[10m]) * 60 > 0
for: 10m
labels:
severity: warning
annotations:
summary: "Email Sender throttled sending for `{{ $labels.tenant_id }}` Central instance"
description: "Email Sender throttled `{{ $labels.tenant_id }}` Central {{ $value | humanize }} time(s) per minute over the last 10 minutes."
sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-048-emailsender-ses-send-throttled.md"

- name: tenant-resources
rules:
- expr: |
27 changes: 27 additions & 0 deletions resources/prometheus/unit_tests/RHACSEmailsenderContainerDown.yaml
@@ -0,0 +1,27 @@
rule_files:
- /tmp/prometheus-rules-test.yaml

evaluation_interval: 1m

tests:
- interval: 1m
input_series:
- series: kube_pod_container_status_ready{namespace="rhacs", pod="emailsender-123", container="emailsender"}
values: "1+0x10 0+0x50"
alert_rule_test:
- eval_time: 15m
alertname: RHACSEmailsenderContainerDown
exp_alerts: []
- eval_time: 40m
alertname: RHACSEmailsenderContainerDown
exp_alerts:
- exp_labels:
alertname: RHACSEmailsenderContainerDown
container: emailsender
namespace: rhacs
pod: emailsender-123
severity: warning
exp_annotations:
summary: "Email Sender container `emailsender-123/emailsender` in namespace `rhacs` is down or in a CrashLoopBackOff status."
description: "Email Sender container `emailsender-123/emailsender` in namespace `rhacs` has been down or in a CrashLoopBackOff status for at least 10 minutes."
sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-046-emailsender-unavailable.md"
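Note on the test inputs: promtool's expanding series notation `a+bxN` produces N+1 samples starting at `a` and growing by `b` per interval. A worked reading of the series above (derived from this commit's data; the minute-36 firing time is an approximation):

# "1+0x10 0+0x50" at a 1m interval expands to:
#   minutes 0-10 : 1   (container ready)
#   minutes 11-61: 0   (container not ready)
# avg_over_time(kube_pod_container_status_ready[10m]) drops below 0.5 around
# minute 16, and the rule's `for: 20m` clause then makes the alert fire at
# roughly minute 36 -- which is why eval_time 15m expects no alert and 40m expects one.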
27 changes: 27 additions & 0 deletions resources/prometheus/unit_tests/RHACSEmailsenderContainerFrequentlyRestarting.yaml
@@ -0,0 +1,27 @@
rule_files:
- /tmp/prometheus-rules-test.yaml

evaluation_interval: 1m

tests:
- interval: 1m
input_series:
- series: kube_pod_container_status_restarts_total{namespace="rhacs", pod="emailsender-123", container="emailsender"}
values: "0+0x30 1+1x10 4+1x20"
alert_rule_test:
- eval_time: 30m
alertname: RHACSEmailsenderContainerFrequentlyRestarting
exp_alerts: []
- eval_time: 60m
alertname: RHACSEmailsenderContainerFrequentlyRestarting
exp_alerts:
- exp_labels:
alertname: RHACSEmailsenderContainerFrequentlyRestarting
container: emailsender
namespace: rhacs
pod: emailsender-123
severity: warning
exp_annotations:
summary: "Email Sender container `emailsender-123/emailsender` in namespace `rhacs` restarted more than 3 times."
description: "Email Sender container `emailsender-123/emailsender` in namespace `rhacs` has restarted more than 3 times during the last 30 minutes."
sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-046-emailsender-unavailable.md"
29 changes: 29 additions & 0 deletions resources/prometheus/unit_tests/RHACSEmailsenderScrapeFailed.yaml
@@ -0,0 +1,29 @@
rule_files:
- /tmp/prometheus-rules-test.yaml

evaluation_interval: 1m

tests:
- interval: 1m
input_series:
- series: up{namespace="rhacs", pod="emailsender-123", instance="1.2.3.4:9090"}
values: "0+0x20 1+0x20"
- series: kube_pod_container_status_ready{namespace="rhacs", pod="emailsender-123"}
values: "1+0x40"
alert_rule_test:
- eval_time: 10m
alertname: RHACSEmailsenderScrapeFailed
exp_alerts: []
- eval_time: 25m
alertname: RHACSEmailsenderScrapeFailed
exp_alerts:
- exp_labels:
alertname: RHACSEmailsenderScrapeFailed
instance: 1.2.3.4:9090
namespace: rhacs
pod: emailsender-123
severity: warning
exp_annotations:
summary: "Prometheus unable to scrape metrics from target `emailsender-123` in namespace `rhacs`."
description: "During the last 10 minutes, only `45.45%` of scrapes of target `emailsender-123` in namespace `rhacs` were successful. This alert is raised when less than 50% of scrapes are successful."
sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-046-emailsender-unavailable.md"
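The expression for this alert has two branches: the first fires when a pod that reports itself ready is nevertheless failing scrapes, and the `absent(...)` branch fires when no emailsender target exists at all. A minimal sketch of one more case (not part of this commit, reusing the labels and value notation from the test above): it illustrates that the `and ON(pod) ... == 1` join suppresses the alert when the container is not ready, leaving that situation to RHACSEmailsenderContainerDown.

  - interval: 1m
    input_series:
      - series: up{namespace="rhacs", pod="emailsender-123", instance="1.2.3.4:9090"}
        values: "0+0x40"
      - series: kube_pod_container_status_ready{namespace="rhacs", pod="emailsender-123"}
        values: "0+0x40"
    alert_rule_test:
      - eval_time: 25m
        alertname: RHACSEmailsenderScrapeFailed
        # no alert expected: the pod is not ready, so the scrape-failure branch is
        # filtered out, and `up` is present, so absent() does not fire either
        exp_alerts: []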
29 changes: 29 additions & 0 deletions resources/prometheus/unit_tests/RHACSEmailsenderSendErrors.yaml
@@ -0,0 +1,29 @@
rule_files:
- /tmp/prometheus-rules-test.yaml

evaluation_interval: 1m

tests:
- interval: 1m
input_series:
- series: acs_emailsender_failed_send_email_total{namespace="rhacs", pod="emailsender-123", container="emailsender"}
values: "0+0x10 1+1x50"
- series: acs_emailsender_send_email_total{namespace="rhacs", pod="emailsender-123", container="emailsender"}
values: "1+1x10 1+2x50"
alert_rule_test:
- eval_time: 15m
alertname: RHACSEmailsenderSendErrors
exp_alerts: []
- eval_time: 40m
alertname: RHACSEmailsenderSendErrors
exp_alerts:
- exp_labels:
alertname: RHACSEmailsenderSendErrors
container: emailsender
namespace: rhacs
pod: emailsender-123
severity: warning
exp_annotations:
summary: "Email Sender container failing sending emails"
description: "Email Sender has a send email error rate of 50% over the last 10 minutes."
sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-047-emailsender-ses-send-error.md"
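For reference, the expected 50% in this test follows directly from the input series (a worked reading of the data above, not additional test code):

# Over the [10m] window at eval_time 40m:
#   acs_emailsender_failed_send_email_total grows by 1 per minute -> rate ≈ 1/60 per second
#   acs_emailsender_send_email_total grows by 2 per minute        -> rate ≈ 2/60 per second
#   ratio = (1/60) / (2/60) = 0.5, rendered by humanizePercentage as "50%"
# Any ratio above the rule's 0.10 threshold, sustained for 5 minutes, alerts.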
28 changes: 28 additions & 0 deletions resources/prometheus/unit_tests/RHACSEmailsenderThrottledSend.yaml
@@ -0,0 +1,28 @@
rule_files:
- /tmp/prometheus-rules-test.yaml

evaluation_interval: 1m

tests:
- interval: 1m
input_series:
- series: acs_emailsender_throttled_send_email_total{namespace="rhacs", pod="emailsender-123", container="emailsender", tenant_id="centralid"}
values: "0+0x10 2+1x60"
alert_rule_test:
- eval_time: 15m
alertname: RHACSEmailsenderThrottledSend
exp_alerts: []
- eval_time: 40m
alertname: RHACSEmailsenderThrottledSend
exp_alerts:
- exp_labels:
alertname: RHACSEmailsenderThrottledSend
container: emailsender
namespace: rhacs
pod: emailsender-123
tenant_id: centralid
severity: warning
exp_annotations:
summary: "Email Sender throttled sending for `centralid` Central instance"
description: "Email Sender throttled `centralid` Central 1 time(s) per minute over the last 10 minutes."
sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-048-emailsender-ses-send-throttled.md"
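The `* 60` in the alert expression converts rate()'s per-second value into throttle events per minute; with this test's input that works out to the 1 per minute shown above (a worked reading of the series, not additional test code):

# "0+0x10 2+1x60" grows by 1 per minute from minute 11 onward, so over any
# 10-minute window after that:
#   increase ≈ 10, rate ≈ 10 / 600s ≈ 0.0167 per second
#   0.0167 * 60 ≈ 1 throttle event per minute
# hence the expected description "1 time(s) per minute over the last 10 minutes".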
