Skip to content

Commit

Permalink
Merge pull request #168 from stackrox/ROX-20792-add-alert-for-operator-oom
Browse files Browse the repository at this point in the history

ROX-20792: Add alert for operator OOM
  • Loading branch information
ludydoo authored Nov 16, 2023
2 parents d37893e + 89f95ee commit f20ed15
Show file tree
Hide file tree
Showing 2 changed files with 65 additions and 0 deletions.
31 changes: 31 additions & 0 deletions resources/prometheus/prometheus-rules.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,37 @@ spec:
summary: "Fleetshard synchronizer manages `{{ $value }}` centrals."
description: "Fleetshard synchronizer manages `{{ $value }}` centrals. The number of Centrals should always be larger than zero in a working system. If it drops to or below zero, fleetshard synchronizer is assumed to be in a failed state."
sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-007-fleetshard-sync-reconciliation-error.md"

# Memory-pressure alerting for the RHACS operator. The recording rules build on
# each other in the order listed; the last one feeds the alert at the bottom.
- name: rhacs-operator
  rules:
    # One series per operator pod in the `rhacs` namespace (pods labelled
    # app=rhacs-operator), with the owning workload joined in as a label.
    - expr: |
        sum (kube_pod_info{namespace="rhacs"}
        * on (pod, namespace) group_left() kube_pod_labels{namespace="rhacs", label_app="rhacs-operator"}
        * on (pod, namespace) group_left(workload) namespace_workload_pod:kube_pod_owner:relabel) by (namespace, workload, pod)
      record: rhacs_operator:namespace:workload:pod
    # Per-pod memory usage: container_memory_max_usage_bytes summed over all
    # named containers of the pod.
    - expr: |
        rhacs_operator:namespace:workload:pod
        * on (pod, namespace) group_left() sum(container_memory_max_usage_bytes{container!=""}) by (pod, namespace)
      record: rhacs_operator:namespace:workload:pod:max_memory_usage_bytes
    # Per-pod memory limit: container_spec_memory_limit_bytes summed over all
    # named containers of the pod.
    - expr: |
        rhacs_operator:namespace:workload:pod
        * on (pod, namespace) group_left() sum(container_spec_memory_limit_bytes{container!=""}) by (pod, namespace)
      record: rhacs_operator:namespace:workload:pod:memory_limit_bytes
    # Usage/limit ratio per workload. The ratio is computed per pod and the
    # worst pod is reported via `max`. Summing per-pod ratios would push the
    # value over the threshold whenever several pods of the same workload exist
    # at once (e.g. during a rollout), even if each pod is far from its limit.
    - expr: |
        max(rhacs_operator:namespace:workload:pod:max_memory_usage_bytes / rhacs_operator:namespace:workload:pod:memory_limit_bytes)
        by (namespace, workload)
      record: rhacs_operator:namespace:workload:max_memory_usage_ratio
    # Fires once the ratio has stayed above 60% for 5 minutes.
    - alert: RHACSOperatorMemoryUtilizationHigh
      expr: |
        rhacs_operator:namespace:workload:max_memory_usage_ratio > 0.6
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: "RHACS Operator '{{ $labels.workload }}' is reaching its memory limit."
        description: "The RHACS operator '{{ $labels.workload }}' reached {{ $value | humanizePercentage }} of its memory limit and is at risk of being OOM killed."
        sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-037-operator-memory-high.md"

- name: rhacs-aws-quota
rules:
- alert: RHACSCentralDBClustersUtilizationHigh
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# promtool unit test for the RHACSOperatorMemoryUtilizationHigh alert.
rule_files:
  - /tmp/prometheus-rules-test.yaml

evaluation_interval: 1m

tests:
  - interval: 1m
    input_series:
      # A single operator pod owned by workload "operator-workload".
      - series: 'kube_pod_info{namespace="rhacs", pod="operator-pod"}'
        values: "1+0x20"
      - series: 'kube_pod_labels{namespace="rhacs", pod="operator-pod", label_app="rhacs-operator"}'
        values: "1+0x20"
      - series: 'namespace_workload_pod:kube_pod_owner:relabel{namespace="rhacs", pod="operator-pod", workload="operator-workload"}'
        values: "1+0x20"
      # Usage is 50 for minutes 0-10, then 70 from minute 11 on; the limit is
      # a constant 100, so the ratio goes from 0.5 to 0.7 at minute 11.
      - series: 'container_memory_max_usage_bytes{namespace="rhacs", pod="operator-pod", container="operator-container"}'
        values: "50+0x10 70+0x10"
      - series: 'container_spec_memory_limit_bytes{namespace="rhacs", pod="operator-pod", container="operator-container"}'
        values: "100+0x20"
    alert_rule_test:
      # Ratio 0.5 is below the 0.6 threshold: nothing fires.
      - eval_time: 1m
        alertname: RHACSOperatorMemoryUtilizationHigh
        exp_alerts: []
      # Above the threshold for only 4 minutes: the alert is still pending
      # because of `for: 5m`, so no firing alert is expected yet.
      - eval_time: 15m
        alertname: RHACSOperatorMemoryUtilizationHigh
        exp_alerts: []
      # Above the threshold for the full 5 minutes: the alert fires.
      - eval_time: 16m
        alertname: RHACSOperatorMemoryUtilizationHigh
        exp_alerts:
          - exp_labels:
              alertname: RHACSOperatorMemoryUtilizationHigh
              severity: warning
              namespace: rhacs
              workload: operator-workload
            exp_annotations:
              description: "The RHACS operator 'operator-workload' reached 70% of its memory limit and is at risk of being OOM killed."
              summary: "RHACS Operator 'operator-workload' is reaching its memory limit."
              sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-037-operator-memory-high.md"

0 comments on commit f20ed15

Please sign in to comment.