diff --git a/resources/grafana/sources/rhacs-cluster-overview.json b/resources/grafana/sources/rhacs-cluster-overview.json index 19e277ce..4cb7ccba 100644 --- a/resources/grafana/sources/rhacs-cluster-overview.json +++ b/resources/grafana/sources/rhacs-cluster-overview.json @@ -728,7 +728,7 @@ "uid": "PBFA97CFB590B2093" }, "editorMode": "code", - "expr": "availability_zone:strictly_worker_nodes:cpu_limit_ratio", + "expr": "availability_zone:acscs_worker_nodes:cpu_limit_ratio", "interval": "", "legendFormat": "Limit / {{availability_zone}}", "range": true, @@ -740,7 +740,7 @@ "uid": "PBFA97CFB590B2093" }, "editorMode": "code", - "expr": "availability_zone:strictly_worker_nodes:cpu_request_ratio", + "expr": "availability_zone:acscs_worker_nodes:cpu_request_ratio", "hide": false, "interval": "", "legendFormat": "Request / {{availability_zone}}", @@ -837,7 +837,7 @@ "uid": "PBFA97CFB590B2093" }, "editorMode": "code", - "expr": "availability_zone:strictly_worker_nodes:memory_limit_ratio", + "expr": "availability_zone:acscs_worker_nodes:memory_limit_ratio", "interval": "", "legendFormat": "Limit / {{availability_zone}}", "range": true, @@ -849,7 +849,7 @@ "uid": "PBFA97CFB590B2093" }, "editorMode": "code", - "expr": "availability_zone:strictly_worker_nodes:memory_request_ratio", + "expr": "availability_zone:acscs_worker_nodes:memory_request_ratio", "hide": false, "interval": "", "legendFormat": "Request / {{availability_zone}}", diff --git a/resources/grafana/sources/rhacs-cluster-resource-adjustment.json b/resources/grafana/sources/rhacs-cluster-resource-adjustment.json index f21d417d..ebc6efac 100644 --- a/resources/grafana/sources/rhacs-cluster-resource-adjustment.json +++ b/resources/grafana/sources/rhacs-cluster-resource-adjustment.json @@ -910,7 +910,7 @@ "uid": "PBFA97CFB590B2093" }, "editorMode": "code", - "expr": "sum(strictly_worker_nodes)", + "expr": "sum(acscs_worker_nodes)", "legendFormat": "__auto", "range": true, "refId": "A" @@ -977,7 +977,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "sum(node_namespace_pod_container:container_memory_working_set_bytes{container!=\"\"} * on (node) group_left() strictly_worker_nodes) / sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate * on (node) group_left() strictly_worker_nodes) / 1024 / 1024 / 1024", + "expr": "sum(node_namespace_pod_container:container_memory_working_set_bytes{container!=\"\"} * on (node) group_left() acscs_worker_nodes) / sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate * on (node) group_left() acscs_worker_nodes) / 1024 / 1024 / 1024", "instant": false, "legendFormat": "__auto", "range": true, @@ -1044,7 +1044,7 @@ "uid": "PBFA97CFB590B2093" }, "editorMode": "code", - "expr": "sum(kube_node_status_capacity{resource=\"memory\", unit=\"byte\"} * on(node) group_left() strictly_worker_nodes) / sum(kube_node_status_capacity{resource=\"cpu\", unit=\"core\"} * on(node) group_left() strictly_worker_nodes) / 1024 / 1024 / 1024", + "expr": "sum(kube_node_status_capacity{resource=\"memory\", unit=\"byte\"} * on(node) group_left() acscs_worker_nodes) / sum(kube_node_status_capacity{resource=\"cpu\", unit=\"core\"} * on(node) group_left() acscs_worker_nodes) / 1024 / 1024 / 1024", "legendFormat": "__auto", "range": true, "refId": "A" @@ -1149,7 +1149,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "sum(kube_node_status_capacity{resource=\"memory\", unit=\"byte\"} * on(node) group_left() strictly_worker_nodes)\n", + "expr": "sum(kube_node_status_capacity{resource=\"memory\", unit=\"byte\"} * on(node) group_left() acscs_worker_nodes)\n", "format": "time_series", "hide": false, "instant": false, @@ -1221,7 +1221,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "sum(node_namespace_pod_container:container_memory_working_set_bytes{container!=\"\"} * on (node) group_left() strictly_worker_nodes)", + "expr": "sum(node_namespace_pod_container:container_memory_working_set_bytes{container!=\"\"} * on (node) group_left() acscs_worker_nodes)", "format": "time_series", "hide": false, "instant": false, @@ -1293,7 +1293,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "sum(node_namespace_pod_container:container_memory_working_set_bytes{container!=\"\"} * on (node) group_left() strictly_worker_nodes) / \nsum(kube_node_status_capacity{resource=\"memory\", unit=\"byte\"} * on(node) group_left() strictly_worker_nodes)\n ", + "expr": "sum(node_namespace_pod_container:container_memory_working_set_bytes{container!=\"\"} * on (node) group_left() acscs_worker_nodes) / \nsum(kube_node_status_capacity{resource=\"memory\", unit=\"byte\"} * on(node) group_left() acscs_worker_nodes)\n ", "format": "time_series", "hide": false, "instant": false, @@ -1425,7 +1425,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "sum(node_namespace_pod_container:container_memory_working_set_bytes{container!=\"\"} * on (node) group_left() strictly_worker_nodes) / \nsum(kube_node_status_capacity{resource=\"memory\", unit=\"byte\"} * on(node) group_left() strictly_worker_nodes)\n ", + "expr": "sum(node_namespace_pod_container:container_memory_working_set_bytes{container!=\"\"} * on (node) group_left() acscs_worker_nodes) / \nsum(kube_node_status_capacity{resource=\"memory\", unit=\"byte\"} * on(node) group_left() acscs_worker_nodes)\n ", "format": "time_series", "hide": false, "instant": false, @@ -1441,7 +1441,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "sum(node_namespace_pod_container:container_memory_working_set_bytes{container!=\"\"} * on (node) group_left() strictly_worker_nodes) \n ", + "expr": "sum(node_namespace_pod_container:container_memory_working_set_bytes{container!=\"\"} * on (node) group_left() acscs_worker_nodes) \n ", "format": "time_series", "hide": false, "instant": false, @@ -1537,7 +1537,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "sum(kube_node_status_capacity{resource=\"cpu\", unit=\"core\"} * on(node) group_left() strictly_worker_nodes)\n", + "expr": "sum(kube_node_status_capacity{resource=\"cpu\", unit=\"core\"} * on(node) group_left() acscs_worker_nodes)\n", "format": "time_series", "hide": false, "instant": false, @@ -1609,7 +1609,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate * on (node) group_left() strictly_worker_nodes)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate * on (node) group_left() acscs_worker_nodes)", "format": "time_series", "hide": false, "instant": false, @@ -1681,7 +1681,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate * on (node) group_left() strictly_worker_nodes) / sum(kube_node_status_capacity{resource=\"cpu\", unit=\"core\"} * on(node) group_left() strictly_worker_nodes)\n", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate * on (node) group_left() acscs_worker_nodes) / sum(kube_node_status_capacity{resource=\"cpu\", unit=\"core\"} * on(node) group_left() acscs_worker_nodes)\n", "format": "time_series", "hide": false, "instant": false, @@ -1813,7 +1813,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate * on (node) group_left() strictly_worker_nodes) / sum(kube_node_status_capacity{resource=\"cpu\", unit=\"core\"} * on(node) group_left() strictly_worker_nodes)\n", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate * on (node) group_left() acscs_worker_nodes) / sum(kube_node_status_capacity{resource=\"cpu\", unit=\"core\"} * on(node) group_left() acscs_worker_nodes)\n", "format": "time_series", "hide": false, "instant": false, @@ -1828,7 +1828,7 @@ "uid": "PBFA97CFB590B2093" }, "editorMode": "code", - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate * on (node) group_left() strictly_worker_nodes)\n", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate * on (node) group_left() acscs_worker_nodes)\n", "hide": false, "legendFormat": "cpu absolute", "range": true, diff --git a/resources/prometheus/prometheus-rules.yaml b/resources/prometheus/prometheus-rules.yaml index 86d62a4b..974b65c4 100644 --- a/resources/prometheus/prometheus-rules.yaml +++ b/resources/prometheus/prometheus-rules.yaml @@ -547,76 +547,76 @@ spec: rhacs_instance_id: "{{ $labels.rhacs_instance_id }}" - name: az-resources rules: - - record: strictly_worker_nodes + - record: acscs_worker_nodes expr: | - kube_node_role{role="worker"} * on (node) (sum(kube_node_role) by (node) == 1) + kube_node_role{role="acscs-worker"} - record: node_availability_zone expr: | sum(label_replace(kube_node_labels, "availability_zone", "$1", "label_failure_domain_beta_kubernetes_io_zone", "(.*)")) by (availability_zone, node) > 0 - - record: memory_resource_requests:strictly_worker_nodes:by_availability_zone:sum + - record: memory_resource_requests:acscs_worker_nodes:by_availability_zone:sum expr: | sum( sum(cluster:namespace:pod_memory:active:kube_pod_container_resource_requests{resource="memory",observability="",job="kube-state-metrics"}) by (node) - * on (node) strictly_worker_nodes + * on (node) acscs_worker_nodes * on (node) group_left(availability_zone) node_availability_zone ) by (availability_zone) - - record: memory_resource_limits:strictly_worker_nodes:by_availability_zone:sum + - record: memory_resource_limits:acscs_worker_nodes:by_availability_zone:sum expr: | sum( sum(cluster:namespace:pod_memory:active:kube_pod_container_resource_limits{resource="memory",observability="",job="kube-state-metrics"}) by (node) - * on (node) strictly_worker_nodes + * on (node) acscs_worker_nodes * on (node) group_left(availability_zone) node_availability_zone ) by (availability_zone) - - record: cpu_resource_requests:strictly_worker_nodes:by_availability_zone:sum + - record: cpu_resource_requests:acscs_worker_nodes:by_availability_zone:sum expr: | sum( sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests{resource="cpu", observability="",job="kube-state-metrics"}) by (node) - * on (node) strictly_worker_nodes + * on (node) acscs_worker_nodes * on (node) group_left(availability_zone) node_availability_zone ) by (availability_zone) - - record: cpu_resource_limits:strictly_worker_nodes:by_availability_zone:sum + - record: cpu_resource_limits:acscs_worker_nodes:by_availability_zone:sum expr: | sum( sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits{observability="",job="kube-state-metrics"}) by (node) - * on (node) strictly_worker_nodes + * on (node) acscs_worker_nodes * on (node) group_left(availability_zone) node_availability_zone ) by (availability_zone) - - record: availability_zone:strictly_worker_nodes:allocatable_cpu + - record: availability_zone:acscs_worker_nodes:allocatable_cpu expr: | sum( sum(kube_node_status_allocatable{resource="cpu"}) by (node) - * on (node) strictly_worker_nodes + * on (node) acscs_worker_nodes * on (node) group_left(availability_zone) node_availability_zone ) by (availability_zone) - - record: availability_zone:strictly_worker_nodes:allocatable_memory + - record: availability_zone:acscs_worker_nodes:allocatable_memory expr: | sum( sum(kube_node_status_allocatable{resource="memory"}) by (node) - * on (node) strictly_worker_nodes + * on (node) acscs_worker_nodes * on (node) group_left(availability_zone) node_availability_zone ) by (availability_zone) - - record: availability_zone:strictly_worker_nodes:memory_request_ratio + - record: availability_zone:acscs_worker_nodes:memory_request_ratio expr: | - memory_resource_requests:strictly_worker_nodes:by_availability_zone:sum + memory_resource_requests:acscs_worker_nodes:by_availability_zone:sum / - availability_zone:strictly_worker_nodes:allocatable_memory - - record: availability_zone:strictly_worker_nodes:cpu_request_ratio + availability_zone:acscs_worker_nodes:allocatable_memory + - record: availability_zone:acscs_worker_nodes:cpu_request_ratio expr: | - cpu_resource_requests:strictly_worker_nodes:by_availability_zone:sum + cpu_resource_requests:acscs_worker_nodes:by_availability_zone:sum / - availability_zone:strictly_worker_nodes:allocatable_cpu - - record: availability_zone:strictly_worker_nodes:memory_limit_ratio + availability_zone:acscs_worker_nodes:allocatable_cpu + - record: availability_zone:acscs_worker_nodes:memory_limit_ratio expr: | - memory_resource_limits:strictly_worker_nodes:by_availability_zone:sum + memory_resource_limits:acscs_worker_nodes:by_availability_zone:sum / - availability_zone:strictly_worker_nodes:allocatable_memory - - record: availability_zone:strictly_worker_nodes:cpu_limit_ratio + availability_zone:acscs_worker_nodes:allocatable_memory + - record: availability_zone:acscs_worker_nodes:cpu_limit_ratio expr: | - cpu_resource_limits:strictly_worker_nodes:by_availability_zone:sum + cpu_resource_limits:acscs_worker_nodes:by_availability_zone:sum / - availability_zone:strictly_worker_nodes:allocatable_cpu + availability_zone:acscs_worker_nodes:allocatable_cpu - alert: WorkerNodesMemoryQuotaOverCommit - expr: avg(availability_zone:strictly_worker_nodes:memory_request_ratio) > 0.8 + expr: avg(availability_zone:acscs_worker_nodes:memory_request_ratio) > 0.8 for: 5m labels: severity: critical @@ -625,7 +625,7 @@ spec: description: "During the last 5 minutes, the average memory request commitment on worker nodes was {{ $value | humanizePercentage }}. This is above the recommended threshold of 80%." sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-cloud-service/runbooks/-/blob/master/sops/dp-027-cluster-scale-up.md" - alert: WorkerNodesCPUQuotaOverCommit - expr: avg(availability_zone:strictly_worker_nodes:cpu_request_ratio) > 0.8 + expr: avg(availability_zone:acscs_worker_nodes:cpu_request_ratio) > 0.8 for: 5m labels: severity: critical @@ -634,7 +634,7 @@ spec: description: "During the last 5 minutes, the average CPU request commitment on worker nodes was {{ $value | humanizePercentage }}. This is above the recommended threshold of 80%." sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-cloud-service/runbooks/-/blob/master/sops/dp-027-cluster-scale-up.md" - alert: WorkerNodesMemoryOverCommit - expr: avg(availability_zone:strictly_worker_nodes:memory_limit_ratio) > 2 + expr: avg(availability_zone:acscs_worker_nodes:memory_limit_ratio) > 2 for: 5m labels: severity: critical diff --git a/resources/prometheus/unit_tests/WorkerNodesCPUQuotaOverCommit.yaml b/resources/prometheus/unit_tests/WorkerNodesCPUQuotaOverCommit.yaml index e4425f86..4648c0b9 100644 --- a/resources/prometheus/unit_tests/WorkerNodesCPUQuotaOverCommit.yaml +++ b/resources/prometheus/unit_tests/WorkerNodesCPUQuotaOverCommit.yaml @@ -6,7 +6,7 @@ evaluation_interval: 1m tests: - interval: 1m input_series: - - series: kube_node_role{node="worker-1", role="worker"} + - series: kube_node_role{node="worker-1", role="acscs-worker"} values: "1" - series: kube_node_labels{node="worker-1", label_failure_domain_beta_kubernetes_io_zone="us-east-1a"} values: "1" diff --git a/resources/prometheus/unit_tests/WorkerNodesMemoryOverCommit.yaml b/resources/prometheus/unit_tests/WorkerNodesMemoryOverCommit.yaml index 8dbb66a1..e22392bc 100644 --- a/resources/prometheus/unit_tests/WorkerNodesMemoryOverCommit.yaml +++ b/resources/prometheus/unit_tests/WorkerNodesMemoryOverCommit.yaml @@ -6,7 +6,7 @@ evaluation_interval: 1m tests: - interval: 1m input_series: - - series: kube_node_role{node="worker-1", role="worker"} + - series: kube_node_role{node="worker-1", role="acscs-worker"} values: "1" - series: kube_node_labels{node="worker-1", label_failure_domain_beta_kubernetes_io_zone="us-east-1a"} values: "1" diff --git a/resources/prometheus/unit_tests/WorkerNodesMemoryQuotaOverCommit.yaml b/resources/prometheus/unit_tests/WorkerNodesMemoryQuotaOverCommit.yaml index 172a540d..52374d01 100644 --- a/resources/prometheus/unit_tests/WorkerNodesMemoryQuotaOverCommit.yaml +++ b/resources/prometheus/unit_tests/WorkerNodesMemoryQuotaOverCommit.yaml @@ -6,7 +6,7 @@ evaluation_interval: 1m tests: - interval: 1m input_series: - - series: kube_node_role{node="worker-1", role="worker"} + - series: kube_node_role{node="worker-1", role="acscs-worker"} values: "1" - series: kube_node_labels{node="worker-1", label_failure_domain_beta_kubernetes_io_zone="us-east-1a"} values: "1"