Skip to content

Commit

Permalink
Merge pull request #172 from stackrox/ROX-21046-alert-for-tenant-oom
Browse files Browse the repository at this point in the history
ROX-21046: Alerts for tenant nearing OOM
  • Loading branch information
ludydoo authored Dec 7, 2023
2 parents 60f26a4 + dc885ab commit 4f1d0c9
Show file tree
Hide file tree
Showing 2 changed files with 84 additions and 0 deletions.
32 changes: 32 additions & 0 deletions resources/prometheus/prometheus-rules.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,38 @@ spec:
description: "Fleetshard synchronizer manages `{{ $value }}` centrals. The number of Centrals should always be larger than zero in a working system. If it drops to or below zero, fleetshard synchronizer is assumed to be in a failed state."
sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-007-fleetshard-sync-reconciliation-error.md"

- name: tenant-resources
  # Memory-pressure recording rules and alerts for tenant workloads.
  #
  # Every selector below uses namespace=~"rhacs-.{20}", i.e. "rhacs-" followed
  # by exactly 20 characters (PromQL regex matchers are fully anchored), and
  # excludes series whose `container` label is "POD" or empty (presumably the
  # pause container and pod-level aggregate series - confirm against the
  # metric source).
  rules:
    # Working-set memory per (namespace, pod, container).
    # NOTE(review): the record name says "max" but the expression aggregates
    # with sum(); the two only differ when several series share the same
    # (namespace, pod, container) labels. Kept as-is to preserve behaviour.
    - expr: |
        sum(container_memory_working_set_bytes{namespace=~"rhacs-.{20}",container!="POD",container!=""}) by (namespace, container, pod)
      record: rhacs_tenants:namespace:pod:container:max_memory_usage_bytes

    # Configured memory limit per (namespace, pod, container).
    - expr: |
        sum(container_spec_memory_limit_bytes{namespace=~"rhacs-.{20}",container!="POD",container!=""}) by (namespace, container, pod)
      record: rhacs_tenants:namespace:pod:container:memory_limit_bytes

    # Usage as a fraction of the limit (1.0 == at the limit).
    # The `> 0` filter drops series whose limit is 0 (expected when a container
    # has no memory limit configured - confirm against the metric source).
    # Without it the division yields +Inf, which satisfies both alert
    # thresholds below and would page for containers that cannot hit a limit.
    - expr: |
        rhacs_tenants:namespace:pod:container:max_memory_usage_bytes
          / (rhacs_tenants:namespace:pod:container:memory_limit_bytes > 0)
      record: rhacs_tenants:namespace:pod:container:max_memory_usage_ratio

    # Warning: usage has stayed at or above 75% of the limit for 10 minutes.
    - alert: RHACSTenantWorkloadMemoryUtilizationHigh
      expr: |
        rhacs_tenants:namespace:pod:container:max_memory_usage_ratio >= 0.75
      for: 10m
      labels:
        severity: warning
      annotations:
        summary: "tenant '{{ $labels.namespace }}' container '{{ $labels.container }}' in pod '{{ $labels.pod }}' is reaching its memory limit."
        description: "tenant '{{ $labels.namespace }}' container '{{ $labels.container }}' in pod '{{ $labels.pod }}' reached {{ $value | humanizePercentage }} of its memory limit and is at risk of being OOM killed."
        sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-cloud-service/runbooks/-/blob/master/sops/dp-039-tenant-workload-memory-utilization-high.md"

    # Critical: usage has stayed at or above 90% of the limit for 10 minutes.
    # The warning alert above also matches in this range (0.9 >= 0.75).
    - alert: RHACSTenantWorkloadMemoryUtilizationCritical
      expr: |
        rhacs_tenants:namespace:pod:container:max_memory_usage_ratio >= 0.9
      for: 10m
      labels:
        severity: critical
      annotations:
        summary: "tenant '{{ $labels.namespace }}' container '{{ $labels.container }}' in pod '{{ $labels.pod }}' is critically reaching its memory limit."
        description: "tenant '{{ $labels.namespace }}' container '{{ $labels.container }}' in pod '{{ $labels.pod }}' reached {{ $value | humanizePercentage }} of its memory limit and is at high risk of being OOM killed."
        sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-cloud-service/runbooks/-/blob/master/sops/dp-039-tenant-workload-memory-utilization-high.md"

- name: rhacs-operator
rules:
- expr: |
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# promtool unit tests for the tenant memory-utilization alerts
# (RHACSTenantWorkloadMemoryUtilizationHigh / ...Critical).
rule_files:
  - /tmp/prometheus-rules-test.yaml

evaluation_interval: 1m

tests:
  # Warning alert: usage is 50% of the limit for minutes 0-10, then 75% from
  # minute 11 onwards.
  - interval: 1m
    input_series:
      - series: 'container_memory_working_set_bytes{namespace="rhacs-aaaaaaaaaaaaaaaaaaaa",pod="mypod",container="container-1"}'
        values: "50+0x10 75+0x10"
      # 22 samples (minutes 0-21) so the limit has a real sample at every
      # evaluated timestamp instead of relying on the query lookback at 21m.
      - series: 'container_spec_memory_limit_bytes{namespace="rhacs-aaaaaaaaaaaaaaaaaaaa",pod="mypod",container="container-1"}'
        values: "100+0x21"
    alert_rule_test:
      # Below the threshold: nothing fires.
      - eval_time: 1m
        alertname: RHACSTenantWorkloadMemoryUtilizationHigh
        exp_alerts: []
      # Threshold crossed at 11m, but `for: 10m` has not elapsed yet, so the
      # alert must not be firing.
      - eval_time: 20m
        alertname: RHACSTenantWorkloadMemoryUtilizationHigh
        exp_alerts: []
      # Ten minutes after the threshold was crossed: the alert fires.
      - eval_time: 21m
        alertname: RHACSTenantWorkloadMemoryUtilizationHigh
        exp_alerts:
          - exp_labels:
              alertname: RHACSTenantWorkloadMemoryUtilizationHigh
              severity: warning
              namespace: rhacs-aaaaaaaaaaaaaaaaaaaa
              pod: mypod
              container: container-1
            exp_annotations:
              summary: "tenant 'rhacs-aaaaaaaaaaaaaaaaaaaa' container 'container-1' in pod 'mypod' is reaching its memory limit."
              description: "tenant 'rhacs-aaaaaaaaaaaaaaaaaaaa' container 'container-1' in pod 'mypod' reached 75% of its memory limit and is at risk of being OOM killed."
              sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-cloud-service/runbooks/-/blob/master/sops/dp-039-tenant-workload-memory-utilization-high.md"

  # Critical alert: usage is 50% of the limit for minutes 0-10, then 90% from
  # minute 11 onwards.
  - interval: 1m
    input_series:
      - series: 'container_memory_working_set_bytes{namespace="rhacs-aaaaaaaaaaaaaaaaaaaa",pod="mypod",container="container-1"}'
        values: "50+0x10 90+0x10"
      - series: 'container_spec_memory_limit_bytes{namespace="rhacs-aaaaaaaaaaaaaaaaaaaa",pod="mypod",container="container-1"}'
        values: "100+0x21"
    alert_rule_test:
      - eval_time: 1m
        alertname: RHACSTenantWorkloadMemoryUtilizationCritical
        exp_alerts: []
      # Still inside the `for: 10m` window: not firing yet.
      - eval_time: 20m
        alertname: RHACSTenantWorkloadMemoryUtilizationCritical
        exp_alerts: []
      - eval_time: 21m
        alertname: RHACSTenantWorkloadMemoryUtilizationCritical
        exp_alerts:
          - exp_labels:
              alertname: RHACSTenantWorkloadMemoryUtilizationCritical
              severity: critical
              namespace: rhacs-aaaaaaaaaaaaaaaaaaaa
              pod: mypod
              container: container-1
            exp_annotations:
              summary: "tenant 'rhacs-aaaaaaaaaaaaaaaaaaaa' container 'container-1' in pod 'mypod' is critically reaching its memory limit."
              description: "tenant 'rhacs-aaaaaaaaaaaaaaaaaaaa' container 'container-1' in pod 'mypod' reached 90% of its memory limit and is at high risk of being OOM killed."
              sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-cloud-service/runbooks/-/blob/master/sops/dp-039-tenant-workload-memory-utilization-high.md"

0 comments on commit 4f1d0c9

Please sign in to comment.