From 51891a20bec8e89f0ec914c1cdab6a6d33ba7c96 Mon Sep 17 00:00:00 2001 From: Ludovic Cleroux Date: Thu, 16 Nov 2023 15:59:24 +0100 Subject: [PATCH 1/2] ROX-20792: Add critical alert for operator OOM --- resources/prometheus/prometheus-rules.yaml | 11 ++++++++ .../RHACSOperatorMemoryUtilizationHigh.yaml | 28 +++++++++++++++++++ 2 files changed, 39 insertions(+) diff --git a/resources/prometheus/prometheus-rules.yaml b/resources/prometheus/prometheus-rules.yaml index 3e09833d..4b2be4b0 100644 --- a/resources/prometheus/prometheus-rules.yaml +++ b/resources/prometheus/prometheus-rules.yaml @@ -188,6 +188,17 @@ spec: summary: RHACS Operator '{{ $labels.workload }}' is reaching its memory limit. description: The RHACS operator '{{ $labels.workload }}' reached {{ $value | humanizePercentage }} of its memory limit and is at risk of being OOM killed. sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-037-operator-memory-high.md" + - alert: RHACSOperatorMemoryUtilizationCritical + expr: | + rhacs_operator:namespace:workload:max_memory_usage_ratio > 0.9 + for: 5m + labels: + severity: critical + annotations: + summary: RHACS Operator '{{ $labels.workload }}' is critically close to its memory limit. + description: The RHACS operator '{{ $labels.workload }}' reached {{ $value | humanizePercentage }} of its memory limit and is at high risk of being OOM killed. + sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-037-operator-memory-high.md" + - name: rhacs-aws-quota rules: diff --git a/resources/prometheus/unit_tests/RHACSOperatorMemoryUtilizationHigh.yaml b/resources/prometheus/unit_tests/RHACSOperatorMemoryUtilizationHigh.yaml index 42d1fe46..8a01d044 100644 --- a/resources/prometheus/unit_tests/RHACSOperatorMemoryUtilizationHigh.yaml +++ b/resources/prometheus/unit_tests/RHACSOperatorMemoryUtilizationHigh.yaml @@ -32,3 +32,31 @@ tests: description: "The RHACS operator 'operator-workload' reached 70% of its memory limit and is at risk of being OOM killed." summary: "RHACS Operator 'operator-workload' is reaching its memory limit." sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-037-operator-memory-high.md" + - interval: 1m + input_series: + - series: kube_pod_info{namespace="rhacs", pod="operator-pod"} + values: "1+0x20" + - series: kube_pod_labels{namespace="rhacs", pod="operator-pod", label_app="rhacs-operator"} + values: "1+0x20" + - series: namespace_workload_pod:kube_pod_owner:relabel{namespace="rhacs", pod="operator-pod", workload="operator-workload"} + values: "1+0x20" + - series: container_memory_max_usage_bytes{namespace="rhacs", pod="operator-pod", container="operator-container"} + values: "50+0x10 95+0x10" + - series: container_spec_memory_limit_bytes{namespace="rhacs", pod="operator-pod", container="operator-container"} + values: "100+0x20" + alert_rule_test: + - eval_time: 1m + alertname: RHACSOperatorMemoryUtilizationCritical + exp_alerts: [] + - eval_time: 17m + alertname: RHACSOperatorMemoryUtilizationCritical + exp_alerts: + - exp_labels: + alertname: RHACSOperatorMemoryUtilizationCritical + severity: critical + namespace: rhacs + workload: operator-workload + exp_annotations: + description: "The RHACS operator 'operator-workload' reached 95% of its memory limit and is at high risk of being OOM killed." + summary: "RHACS Operator 'operator-workload' is critically close to its memory limit." + sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-037-operator-memory-high.md" From 80b5395c965dbd4d505fe693e47cfbf2c4bcb806 Mon Sep 17 00:00:00 2001 From: Ludovic Cleroux Date: Thu, 16 Nov 2023 16:52:12 +0100 Subject: [PATCH 2/2] ROX-20792: Changed the alert to be by container --- resources/prometheus/prometheus-rules.yaml | 40 ++++++++++--------- .../RHACSOperatorMemoryUtilizationHigh.yaml | 32 +++++++-------- 2 files changed, 36 insertions(+), 36 deletions(-) diff --git a/resources/prometheus/prometheus-rules.yaml b/resources/prometheus/prometheus-rules.yaml index 4b2be4b0..f93b475f 100644 --- a/resources/prometheus/prometheus-rules.yaml +++ b/resources/prometheus/prometheus-rules.yaml @@ -162,41 +162,43 @@ spec: - name: rhacs-operator rules: - expr: | - sum (kube_pod_info{namespace="rhacs"} - * on (pod, namespace) group_left() kube_pod_labels{namespace="rhacs", label_app="rhacs-operator"} - * on (pod, namespace) group_left(workload) namespace_workload_pod:kube_pod_owner:relabel) by (namespace, workload, pod) - record: rhacs_operator:namespace:workload:pod + sum (namespace_workload_pod:kube_pod_owner:relabel{namespace="rhacs"} + * on (pod, namespace) group_left() kube_pod_labels{namespace="rhacs", label_app="rhacs-operator"}) + by (pod, namespace, workload) + record: rhacs_operator:namespace:workload:pod:container - expr: | - rhacs_operator:namespace:workload:pod - * on (pod, namespace) group_left() sum(container_memory_max_usage_bytes{container!=""}) by (pod, namespace) - record: rhacs_operator:namespace:workload:pod:max_memory_usage_bytes + sum(container_memory_max_usage_bytes{namespace="rhacs",container!~"POD|"}) by (container, pod, namespace) + * on (namespace, pod) group_left(workload) rhacs_operator:namespace:workload:pod:container + record: rhacs_operator:namespace:workload:pod:container:max_memory_usage_bytes - expr: | - rhacs_operator:namespace:workload:pod - * on (pod, namespace) group_left() sum(container_spec_memory_limit_bytes{container!=""}) by (pod, namespace) - record: rhacs_operator:namespace:workload:pod:memory_limit_bytes + sum(container_spec_memory_limit_bytes{namespace="rhacs",container!~"POD|"}) by (container, pod, namespace) + * on (namespace, pod) group_left(workload) rhacs_operator:namespace:workload:pod:container + record: rhacs_operator:namespace:workload:pod:container:memory_limit_bytes - expr: | - sum(rhacs_operator:namespace:workload:pod:max_memory_usage_bytes / rhacs_operator:namespace:workload:pod:memory_limit_bytes) - by (namespace, workload) - record: rhacs_operator:namespace:workload:max_memory_usage_ratio + sum( + rhacs_operator:namespace:workload:pod:container:max_memory_usage_bytes + / rhacs_operator:namespace:workload:pod:container:memory_limit_bytes) + by (namespace, workload, container) + record: rhacs_operator:namespace:workload:container:max_memory_usage_ratio - alert: RHACSOperatorMemoryUtilizationHigh expr: | - rhacs_operator:namespace:workload:max_memory_usage_ratio > 0.6 + rhacs_operator:namespace:workload:container:max_memory_usage_ratio > 0.6 for: 5m labels: severity: warning annotations: - summary: RHACS Operator '{{ $labels.workload }}' is reaching its memory limit. - description: The RHACS operator '{{ $labels.workload }}' reached {{ $value | humanizePercentage }} of its memory limit and is at risk of being OOM killed. + summary: The container '{{ $labels.container }}' in operator '{{ $labels.workload }}' is reaching its memory limit. + description: The container '{{ $labels.container }}' in operator '{{ $labels.workload }}' reached {{ $value | humanizePercentage }} of its memory limit and is at risk of being OOM killed. sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-037-operator-memory-high.md" - alert: RHACSOperatorMemoryUtilizationCritical expr: | - rhacs_operator:namespace:workload:max_memory_usage_ratio > 0.9 + rhacs_operator:namespace:workload:container:max_memory_usage_ratio > 0.9 for: 5m labels: severity: critical annotations: - summary: RHACS Operator '{{ $labels.workload }}' is critically close to its memory limit. - description: The RHACS operator '{{ $labels.workload }}' reached {{ $value | humanizePercentage }} of its memory limit and is at high risk of being OOM killed. + summary: The container '{{ $labels.container }}' in operator '{{ $labels.workload }}' is critically reaching its memory limit. + description: The container '{{ $labels.container }}' in operator '{{ $labels.workload }}' reached {{ $value | humanizePercentage }} of its memory limit and is at high risk of being OOM killed. sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-037-operator-memory-high.md" diff --git a/resources/prometheus/unit_tests/RHACSOperatorMemoryUtilizationHigh.yaml b/resources/prometheus/unit_tests/RHACSOperatorMemoryUtilizationHigh.yaml index 8a01d044..70dd0fd2 100644 --- a/resources/prometheus/unit_tests/RHACSOperatorMemoryUtilizationHigh.yaml +++ b/resources/prometheus/unit_tests/RHACSOperatorMemoryUtilizationHigh.yaml @@ -6,15 +6,13 @@ evaluation_interval: 1m tests: - interval: 1m input_series: - - series: kube_pod_info{namespace="rhacs", pod="operator-pod"} + - series: namespace_workload_pod:kube_pod_owner:relabel{namespace="rhacs",workload="operator-workload",pod="operator-pod"} values: "1+0x20" - - series: kube_pod_labels{namespace="rhacs", pod="operator-pod", label_app="rhacs-operator"} + - series: kube_pod_labels{namespace="rhacs",label_app="rhacs-operator",pod="operator-pod"} values: "1+0x20" - - series: namespace_workload_pod:kube_pod_owner:relabel{namespace="rhacs", pod="operator-pod", workload="operator-workload"} - values: "1+0x20" - - series: container_memory_max_usage_bytes{namespace="rhacs", pod="operator-pod", container="operator-container"} + - series: container_memory_max_usage_bytes{namespace="rhacs", pod="operator-pod",container="manager"} values: "50+0x10 70+0x10" - - series: container_spec_memory_limit_bytes{namespace="rhacs", pod="operator-pod", container="operator-container"} + - series: container_spec_memory_limit_bytes{namespace="rhacs",pod="operator-pod",container="manager"} values: "100+0x20" alert_rule_test: - eval_time: 1m @@ -28,21 +26,20 @@ tests: severity: warning namespace: rhacs workload: operator-workload + container: manager exp_annotations: - description: "The RHACS operator 'operator-workload' reached 70% of its memory limit and is at risk of being OOM killed." - summary: "RHACS Operator 'operator-workload' is reaching its memory limit." + description: The container 'manager' in operator 'operator-workload' reached 70% of its memory limit and is at risk of being OOM killed. + summary: "The container 'manager' in operator 'operator-workload' is reaching its memory limit." sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-037-operator-memory-high.md" - interval: 1m input_series: - - series: kube_pod_info{namespace="rhacs", pod="operator-pod"} - values: "1+0x20" - - series: kube_pod_labels{namespace="rhacs", pod="operator-pod", label_app="rhacs-operator"} + - series: namespace_workload_pod:kube_pod_owner:relabel{namespace="rhacs",workload="operator-workload",pod="operator-pod"} values: "1+0x20" - - series: namespace_workload_pod:kube_pod_owner:relabel{namespace="rhacs", pod="operator-pod", workload="operator-workload"} + - series: kube_pod_labels{namespace="rhacs",label_app="rhacs-operator",pod="operator-pod"} values: "1+0x20" - - series: container_memory_max_usage_bytes{namespace="rhacs", pod="operator-pod", container="operator-container"} - values: "50+0x10 95+0x10" - - series: container_spec_memory_limit_bytes{namespace="rhacs", pod="operator-pod", container="operator-container"} + - series: container_memory_max_usage_bytes{namespace="rhacs", pod="operator-pod",container="manager"} + values: "50+0x10 91+0x10" + - series: container_spec_memory_limit_bytes{namespace="rhacs",pod="operator-pod",container="manager"} values: "100+0x20" alert_rule_test: - eval_time: 1m @@ -56,7 +53,8 @@ tests: severity: critical namespace: rhacs workload: operator-workload + container: manager exp_annotations: - description: "The RHACS operator 'operator-workload' reached 95% of its memory limit and is at high risk of being OOM killed." - summary: "RHACS Operator 'operator-workload' is critically close to its memory limit." + description: The container 'manager' in operator 'operator-workload' reached 91% of its memory limit and is at high risk of being OOM killed. + summary: The container 'manager' in operator 'operator-workload' is critically reaching its memory limit. sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-037-operator-memory-high.md"