From ce482af2172d74379fcad533cf0b4c3e373f9411 Mon Sep 17 00:00:00 2001 From: Jose Armesto Date: Tue, 29 Oct 2024 16:43:00 +0100 Subject: [PATCH] Only page when there are less than 500MB left on kubelet volume --- CHANGELOG.md | 4 ++++ .../turtles/alerting-rules/storage.workload-cluster.rules.yml | 4 ++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 29262191..1108d2d7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Changed + +- Change `KubeletVolumeSpaceTooLow` to only page when there are 500MB or less of space left, letting the node-problem-detector handle the rest. + ## [4.21.1] - 2024-10-25 ### Fixed diff --git a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/storage.workload-cluster.rules.yml b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/storage.workload-cluster.rules.yml index e3c82b0f..72b7d6e0 100644 --- a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/storage.workload-cluster.rules.yml +++ b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/storage.workload-cluster.rules.yml @@ -44,8 +44,8 @@ spec: # When this happens, the problem_gauge metric has value 1, so we do a multiply join on that metric - 1 to get 0 when the metric is present and active, and keep the series values that are > 0. # The right hand side of the or is necessary because we need to be alerted in clusters without the node-problem-detector. # Note that we add 1 to the disk free space so we still get alerted when the free bytes are 0. - # We are also alerted if the free space is less than 2GB for 30 minutes. 
- expr: (( node_filesystem_free_bytes{cluster_type="workload_cluster",mountpoint=~"(/rootfs)?/var/lib/kubelet"} +1 < (2 * 1024 * 1024 * 1024)) * on (node, cluster_type, cluster_id, installation, organization, pipeline, region, customer) (1 - problem_gauge{reason="KubeletDiskIsFull"}) or sum (node_filesystem_free_bytes{cluster_type="workload_cluster",mountpoint=~"(/rootfs)?/var/lib/kubelet"} +1 < (2 * 1024 * 1024 * 1024)) by (node, cluster_type, cluster_id, installation, organization, pipeline, region, customer)) > 0 + # We are also alerted if the free space is less than 500MB for 60 minutes. + expr: (( node_filesystem_free_bytes{cluster_type="workload_cluster",mountpoint=~"(/rootfs)?/var/lib/kubelet"} +1 < (500 * 1024 * 1024)) * on (node, cluster_type, cluster_id, installation, organization, pipeline, region, customer) (1 - problem_gauge{reason="KubeletDiskIsFull"}) or sum (node_filesystem_free_bytes{cluster_type="workload_cluster",mountpoint=~"(/rootfs)?/var/lib/kubelet"} +1 < (500 * 1024 * 1024)) by (node, cluster_type, cluster_id, installation, organization, pipeline, region, customer)) > 0 for: 60m labels: area: kaas