Skip to content

Commit

Permalink
ROX-16643: Set acscs-worker role
Browse files: browse the repository at this point in the history
  • Loading branch information
ludydoo committed Nov 13, 2023
1 parent: 8a5c5db; commit: d56c91b
Show file tree
Hide file tree
Showing 4 changed files with 47 additions and 47 deletions.
8 changes: 4 additions & 4 deletions resources/grafana/sources/rhacs-cluster-overview.json
Original file line number Diff line number Diff line change
Expand Up @@ -728,7 +728,7 @@
"uid": "PBFA97CFB590B2093"
},
"editorMode": "code",
"expr": "availability_zone:strictly_worker_nodes:cpu_limit_ratio",
"expr": "availability_zone:acscs_worker_nodes:cpu_limit_ratio",
"interval": "",
"legendFormat": "Limit / {{availability_zone}}",
"range": true,
Expand All @@ -740,7 +740,7 @@
"uid": "PBFA97CFB590B2093"
},
"editorMode": "code",
"expr": "availability_zone:strictly_worker_nodes:cpu_request_ratio",
"expr": "availability_zone:acscs_worker_nodes:cpu_request_ratio",
"hide": false,
"interval": "",
"legendFormat": "Request / {{availability_zone}}",
Expand Down Expand Up @@ -837,7 +837,7 @@
"uid": "PBFA97CFB590B2093"
},
"editorMode": "code",
"expr": "availability_zone:strictly_worker_nodes:memory_limit_ratio",
"expr": "availability_zone:acscs_worker_nodes:memory_limit_ratio",
"interval": "",
"legendFormat": "Limit / {{availability_zone}}",
"range": true,
Expand All @@ -849,7 +849,7 @@
"uid": "PBFA97CFB590B2093"
},
"editorMode": "code",
"expr": "availability_zone:strictly_worker_nodes:memory_request_ratio",
"expr": "availability_zone:acscs_worker_nodes:memory_request_ratio",
"hide": false,
"interval": "",
"legendFormat": "Request / {{availability_zone}}",
Expand Down
26 changes: 13 additions & 13 deletions resources/grafana/sources/rhacs-cluster-resource-adjustment.json
Original file line number Diff line number Diff line change
Expand Up @@ -910,7 +910,7 @@
"uid": "PBFA97CFB590B2093"
},
"editorMode": "code",
"expr": "sum(strictly_worker_nodes)",
"expr": "sum(acscs_worker_nodes)",
"legendFormat": "__auto",
"range": true,
"refId": "A"
Expand Down Expand Up @@ -977,7 +977,7 @@
},
"editorMode": "code",
"exemplar": false,
"expr": "sum(node_namespace_pod_container:container_memory_working_set_bytes{container!=\"\"} * on (node) group_left() strictly_worker_nodes) / sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate * on (node) group_left() strictly_worker_nodes) / 1024 / 1024 / 1024",
"expr": "sum(node_namespace_pod_container:container_memory_working_set_bytes{container!=\"\"} * on (node) group_left() acscs_worker_nodes) / sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate * on (node) group_left() acscs_worker_nodes) / 1024 / 1024 / 1024",
"instant": false,
"legendFormat": "__auto",
"range": true,
Expand Down Expand Up @@ -1044,7 +1044,7 @@
"uid": "PBFA97CFB590B2093"
},
"editorMode": "code",
"expr": "sum(kube_node_status_capacity{resource=\"memory\", unit=\"byte\"} * on(node) group_left() strictly_worker_nodes) / sum(kube_node_status_capacity{resource=\"cpu\", unit=\"core\"} * on(node) group_left() strictly_worker_nodes) / 1024 / 1024 / 1024",
"expr": "sum(kube_node_status_capacity{resource=\"memory\", unit=\"byte\"} * on(node) group_left() acscs_worker_nodes) / sum(kube_node_status_capacity{resource=\"cpu\", unit=\"core\"} * on(node) group_left() acscs_worker_nodes) / 1024 / 1024 / 1024",
"legendFormat": "__auto",
"range": true,
"refId": "A"
Expand Down Expand Up @@ -1149,7 +1149,7 @@
},
"editorMode": "code",
"exemplar": false,
"expr": "sum(kube_node_status_capacity{resource=\"memory\", unit=\"byte\"} * on(node) group_left() strictly_worker_nodes)\n",
"expr": "sum(kube_node_status_capacity{resource=\"memory\", unit=\"byte\"} * on(node) group_left() acscs_worker_nodes)\n",
"format": "time_series",
"hide": false,
"instant": false,
Expand Down Expand Up @@ -1221,7 +1221,7 @@
},
"editorMode": "code",
"exemplar": false,
"expr": "sum(node_namespace_pod_container:container_memory_working_set_bytes{container!=\"\"} * on (node) group_left() strictly_worker_nodes)",
"expr": "sum(node_namespace_pod_container:container_memory_working_set_bytes{container!=\"\"} * on (node) group_left() acscs_worker_nodes)",
"format": "time_series",
"hide": false,
"instant": false,
Expand Down Expand Up @@ -1293,7 +1293,7 @@
},
"editorMode": "code",
"exemplar": false,
"expr": "sum(node_namespace_pod_container:container_memory_working_set_bytes{container!=\"\"} * on (node) group_left() strictly_worker_nodes) / \nsum(kube_node_status_capacity{resource=\"memory\", unit=\"byte\"} * on(node) group_left() strictly_worker_nodes)\n ",
"expr": "sum(node_namespace_pod_container:container_memory_working_set_bytes{container!=\"\"} * on (node) group_left() acscs_worker_nodes) / \nsum(kube_node_status_capacity{resource=\"memory\", unit=\"byte\"} * on(node) group_left() acscs_worker_nodes)\n ",
"format": "time_series",
"hide": false,
"instant": false,
Expand Down Expand Up @@ -1425,7 +1425,7 @@
},
"editorMode": "code",
"exemplar": false,
"expr": "sum(node_namespace_pod_container:container_memory_working_set_bytes{container!=\"\"} * on (node) group_left() strictly_worker_nodes) / \nsum(kube_node_status_capacity{resource=\"memory\", unit=\"byte\"} * on(node) group_left() strictly_worker_nodes)\n ",
"expr": "sum(node_namespace_pod_container:container_memory_working_set_bytes{container!=\"\"} * on (node) group_left() acscs_worker_nodes) / \nsum(kube_node_status_capacity{resource=\"memory\", unit=\"byte\"} * on(node) group_left() acscs_worker_nodes)\n ",
"format": "time_series",
"hide": false,
"instant": false,
Expand All @@ -1441,7 +1441,7 @@
},
"editorMode": "code",
"exemplar": false,
"expr": "sum(node_namespace_pod_container:container_memory_working_set_bytes{container!=\"\"} * on (node) group_left() strictly_worker_nodes) \n ",
"expr": "sum(node_namespace_pod_container:container_memory_working_set_bytes{container!=\"\"} * on (node) group_left() acscs_worker_nodes) \n ",
"format": "time_series",
"hide": false,
"instant": false,
Expand Down Expand Up @@ -1537,7 +1537,7 @@
},
"editorMode": "code",
"exemplar": false,
"expr": "sum(kube_node_status_capacity{resource=\"cpu\", unit=\"core\"} * on(node) group_left() strictly_worker_nodes)\n",
"expr": "sum(kube_node_status_capacity{resource=\"cpu\", unit=\"core\"} * on(node) group_left() acscs_worker_nodes)\n",
"format": "time_series",
"hide": false,
"instant": false,
Expand Down Expand Up @@ -1609,7 +1609,7 @@
},
"editorMode": "code",
"exemplar": false,
"expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate * on (node) group_left() strictly_worker_nodes)",
"expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate * on (node) group_left() acscs_worker_nodes)",
"format": "time_series",
"hide": false,
"instant": false,
Expand Down Expand Up @@ -1681,7 +1681,7 @@
},
"editorMode": "code",
"exemplar": false,
"expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate * on (node) group_left() strictly_worker_nodes) / sum(kube_node_status_capacity{resource=\"cpu\", unit=\"core\"} * on(node) group_left() strictly_worker_nodes)\n",
"expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate * on (node) group_left() acscs_worker_nodes) / sum(kube_node_status_capacity{resource=\"cpu\", unit=\"core\"} * on(node) group_left() acscs_worker_nodes)\n",
"format": "time_series",
"hide": false,
"instant": false,
Expand Down Expand Up @@ -1813,7 +1813,7 @@
},
"editorMode": "code",
"exemplar": false,
"expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate * on (node) group_left() strictly_worker_nodes) / sum(kube_node_status_capacity{resource=\"cpu\", unit=\"core\"} * on(node) group_left() strictly_worker_nodes)\n",
"expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate * on (node) group_left() acscs_worker_nodes) / sum(kube_node_status_capacity{resource=\"cpu\", unit=\"core\"} * on(node) group_left() acscs_worker_nodes)\n",
"format": "time_series",
"hide": false,
"instant": false,
Expand All @@ -1828,7 +1828,7 @@
"uid": "PBFA97CFB590B2093"
},
"editorMode": "code",
"expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate * on (node) group_left() strictly_worker_nodes)\n",
"expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate * on (node) group_left() acscs_worker_nodes)\n",
"hide": false,
"legendFormat": "cpu absolute",
"range": true,
Expand Down
58 changes: 29 additions & 29 deletions resources/prometheus/prometheus-rules.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -547,76 +547,76 @@ spec:
rhacs_instance_id: "{{ $labels.rhacs_instance_id }}"
- name: az-resources
rules:
- record: strictly_worker_nodes
- record: acscs_worker_nodes
expr: |
kube_node_role{role="acscs-worker"} * on (node) (sum(kube_node_role) by (node) == 1)
kube_node_role{role="acscs-worker"}
- record: node_availability_zone
expr: |
sum(label_replace(kube_node_labels, "availability_zone", "$1", "label_failure_domain_beta_kubernetes_io_zone", "(.*)")) by (availability_zone, node) > 0
- record: memory_resource_requests:strictly_worker_nodes:by_availability_zone:sum
- record: memory_resource_requests:acscs_worker_nodes:by_availability_zone:sum
expr: |
sum(
sum(cluster:namespace:pod_memory:active:kube_pod_container_resource_requests{resource="memory",observability="",job="kube-state-metrics"}) by (node)
* on (node) strictly_worker_nodes
* on (node) acscs_worker_nodes
* on (node) group_left(availability_zone) node_availability_zone
) by (availability_zone)
- record: memory_resource_limits:strictly_worker_nodes:by_availability_zone:sum
- record: memory_resource_limits:acscs_worker_nodes:by_availability_zone:sum
expr: |
sum(
sum(cluster:namespace:pod_memory:active:kube_pod_container_resource_limits{resource="memory",observability="",job="kube-state-metrics"}) by (node)
* on (node) strictly_worker_nodes
* on (node) acscs_worker_nodes
* on (node) group_left(availability_zone) node_availability_zone
) by (availability_zone)
- record: cpu_resource_requests:strictly_worker_nodes:by_availability_zone:sum
- record: cpu_resource_requests:acscs_worker_nodes:by_availability_zone:sum
expr: |
sum(
sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests{resource="cpu", observability="",job="kube-state-metrics"}) by (node)
* on (node) strictly_worker_nodes
* on (node) acscs_worker_nodes
* on (node) group_left(availability_zone) node_availability_zone
) by (availability_zone)
- record: cpu_resource_limits:strictly_worker_nodes:by_availability_zone:sum
- record: cpu_resource_limits:acscs_worker_nodes:by_availability_zone:sum
expr: |
sum(
sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits{observability="",job="kube-state-metrics"}) by (node)
* on (node) strictly_worker_nodes
* on (node) acscs_worker_nodes
* on (node) group_left(availability_zone) node_availability_zone
) by (availability_zone)
- record: availability_zone:strictly_worker_nodes:allocatable_cpu
- record: availability_zone:acscs_worker_nodes:allocatable_cpu
expr: |
sum(
sum(kube_node_status_allocatable{resource="cpu"}) by (node)
* on (node) strictly_worker_nodes
* on (node) acscs_worker_nodes
* on (node) group_left(availability_zone) node_availability_zone
) by (availability_zone)
- record: availability_zone:strictly_worker_nodes:allocatable_memory
- record: availability_zone:acscs_worker_nodes:allocatable_memory
expr: |
sum(
sum(kube_node_status_allocatable{resource="memory"}) by (node)
* on (node) strictly_worker_nodes
* on (node) acscs_worker_nodes
* on (node) group_left(availability_zone) node_availability_zone
) by (availability_zone)
- record: availability_zone:strictly_worker_nodes:memory_request_ratio
- record: availability_zone:acscs_worker_nodes:memory_request_ratio
expr: |
memory_resource_requests:strictly_worker_nodes:by_availability_zone:sum
memory_resource_requests:acscs_worker_nodes:by_availability_zone:sum
/
availability_zone:strictly_worker_nodes:allocatable_memory
- record: availability_zone:strictly_worker_nodes:cpu_request_ratio
availability_zone:acscs_worker_nodes:allocatable_memory
- record: availability_zone:acscs_worker_nodes:cpu_request_ratio
expr: |
cpu_resource_requests:strictly_worker_nodes:by_availability_zone:sum
cpu_resource_requests:acscs_worker_nodes:by_availability_zone:sum
/
availability_zone:strictly_worker_nodes:allocatable_cpu
- record: availability_zone:strictly_worker_nodes:memory_limit_ratio
availability_zone:acscs_worker_nodes:allocatable_cpu
- record: availability_zone:acscs_worker_nodes:memory_limit_ratio
expr: |
memory_resource_limits:strictly_worker_nodes:by_availability_zone:sum
memory_resource_limits:acscs_worker_nodes:by_availability_zone:sum
/
availability_zone:strictly_worker_nodes:allocatable_memory
- record: availability_zone:strictly_worker_nodes:cpu_limit_ratio
availability_zone:acscs_worker_nodes:allocatable_memory
- record: availability_zone:acscs_worker_nodes:cpu_limit_ratio
expr: |
cpu_resource_limits:strictly_worker_nodes:by_availability_zone:sum
cpu_resource_limits:acscs_worker_nodes:by_availability_zone:sum
/
availability_zone:strictly_worker_nodes:allocatable_cpu
availability_zone:acscs_worker_nodes:allocatable_cpu
- alert: WorkerNodesMemoryQuotaOverCommit
expr: avg(availability_zone:strictly_worker_nodes:memory_request_ratio) > 0.8
expr: avg(availability_zone:acscs_worker_nodes:memory_request_ratio) > 0.8
for: 5m
labels:
severity: critical
Expand All @@ -625,7 +625,7 @@ spec:
description: "During the last 5 minutes, the average memory request commitment on worker nodes was {{ $value | humanizePercentage }}. This is above the recommended threshold of 80%."
sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-cloud-service/runbooks/-/blob/master/sops/dp-027-cluster-scale-up.md"
- alert: WorkerNodesCPUQuotaOverCommit
expr: avg(availability_zone:strictly_worker_nodes:cpu_request_ratio) > 0.8
expr: avg(availability_zone:acscs_worker_nodes:cpu_request_ratio) > 0.8
for: 5m
labels:
severity: critical
Expand All @@ -634,7 +634,7 @@ spec:
description: "During the last 5 minutes, the average CPU request commitment on worker nodes was {{ $value | humanizePercentage }}. This is above the recommended threshold of 80%."
sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-cloud-service/runbooks/-/blob/master/sops/dp-027-cluster-scale-up.md"
- alert: WorkerNodesMemoryOverCommit
expr: avg(availability_zone:strictly_worker_nodes:memory_limit_ratio) > 2
expr: avg(availability_zone:acscs_worker_nodes:memory_limit_ratio) > 2
for: 5m
labels:
severity: critical
Expand Down
2 changes: 1 addition & 1 deletion scripts/test-prom-rules.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]:-$0}")" &>/dev/null && pwd 2

yq eval '.spec' "${SCRIPT_DIR}"/../resources/prometheus/prometheus-rules.yaml >/tmp/prometheus-rules-test.yaml
yq eval '.spec' "${SCRIPT_DIR}"/../resources/prometheus/rhacs-recording-rules.yaml >/tmp/recording-rules-test.yaml
for f in "${SCRIPT_DIR}"/../resources/prometheus/unit_tests/WorkerNodes*; do
for f in "${SCRIPT_DIR}"/../resources/prometheus/unit_tests/*; do
echo "$f"
promtool test rules "${f}"
done

0 comments on commit d56c91b

Please sign in to comment.