diff --git a/CHANGELOG.md b/CHANGELOG.md index deb9853d7..d4cc34618 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Changed + +- Updated `ContainerdVolumeSpaceTooLow`, `KubeletVolumeSpaceTooLow` and `LogVolumeSpaceTooLow` alerts to not trigger when the node-problem-detector is already remediating the issue. + ## [4.1.1] - 2024-05-30 ### Changed diff --git a/helm/prometheus-rules/templates/shared/alerting-rules/disk.management-cluster.rules.yml b/helm/prometheus-rules/templates/shared/alerting-rules/disk.management-cluster.rules.yml index 11bd02aa1..6608308c7 100644 --- a/helm/prometheus-rules/templates/shared/alerting-rules/disk.management-cluster.rules.yml +++ b/helm/prometheus-rules/templates/shared/alerting-rules/disk.management-cluster.rules.yml @@ -27,9 +27,11 @@ spec: topic: storage - alert: ContainerdVolumeSpaceTooLow annotations: - description: '{{`Containerd volume {{ $labels.mountpoint}} on {{ $labels.instance }} does not have enough free space.`}}' + description: '{{`Containerd volume /var/lib/containerd on {{ $labels.node }} does not have enough free space.`}}' opsrecipe: low-disk-space/#containerd-volume - expr: 100 * node_filesystem_free_bytes{cluster_type="management_cluster", mountpoint=~"(/rootfs)?/var/lib/containerd"} / node_filesystem_size_bytes{cluster_type="management_cluster", mountpoint=~"(/rootfs)?/var/lib/containerd"} < 10 + # See below comment for the KubeletVolumeSpaceTooLow alert regarding the node-problem-detector. + # When the node-problem-detector is not remediating, the alert fires once free space has been below 10% for 10 minutes. 
+ expr: (( 100 * (node_filesystem_free_bytes{cluster_type="management_cluster", mountpoint=~"(/rootfs)?/var/lib/containerd"} +1) / node_filesystem_size_bytes{cluster_type="management_cluster", mountpoint=~"(/rootfs)?/var/lib/containerd"} < 10) * on (node, cluster_type, cluster_id, installation, organization, pipeline, region, customer) (1 - problem_gauge{reason="ContainerdDiskIsFull"}) or sum (100 * (node_filesystem_free_bytes{cluster_type="management_cluster", mountpoint=~"(/rootfs)?/var/lib/containerd"} +1) / node_filesystem_size_bytes{cluster_type="management_cluster", mountpoint=~"(/rootfs)?/var/lib/containerd"} < 10) by (node, cluster_type, cluster_id, installation, organization, pipeline, region, customer)) > 0 for: 10m labels: area: kaas @@ -51,9 +53,14 @@ spec: topic: storage - alert: KubeletVolumeSpaceTooLow annotations: - description: '{{`Kubelet volume {{ $labels.mountpoint}} on {{ $labels.instance }} does not have enough free space.`}}' - opsrecipe: low-disk-space/#root-volume - expr: node_filesystem_free_bytes{cluster_type="management_cluster", mountpoint=~"(/rootfs)?/var/lib/kubelet"} < (2 * 1024 * 1024 * 1024) + description: '{{`Kubelet volume /var/lib/kubelet on {{ $labels.node }} does not have enough free space.`}}' + opsrecipe: low-disk-space/#kubelet-volume + # In clusters where the node-problem-detector-app (https://github.com/giantswarm/node-problem-detector-app/) is installed, we don't want to get alerted if the node-problem-detector is already remediating the issue. + # When this happens, the problem_gauge metric has value 1, so we do a multiply join on (1 - that metric) to get 0 when the metric is present and active, and keep the series values that are > 0. + # The right hand side of the or is necessary because we need to be alerted in clusters without the node-problem-detector. + # Note that we add 1 to the disk free space so we still get alerted when the free bytes are 0. 
+ # When the node-problem-detector is not remediating, the alert fires once free space has been below 2GiB for 10 minutes. + expr: (( node_filesystem_free_bytes{cluster_type="management_cluster",mountpoint=~"(/rootfs)?/var/lib/kubelet"} +1 < (2 * 1024 * 1024 * 1024)) * on (node, cluster_type, cluster_id, installation, organization, pipeline, region, customer) (1 - problem_gauge{reason="KubeletDiskIsFull"}) or sum (node_filesystem_free_bytes{cluster_type="management_cluster",mountpoint=~"(/rootfs)?/var/lib/kubelet"} +1 < (2 * 1024 * 1024 * 1024)) by (node, cluster_type, cluster_id, installation, organization, pipeline, region, customer)) > 0 for: 10m labels: area: kaas @@ -63,9 +70,11 @@ spec: topic: storage - alert: LogVolumeSpaceTooLow annotations: - description: '{{`Log volume {{ $labels.mountpoint}} on {{ $labels.instance }} does not have enough free space.`}}' - opsrecipe: low-disk-space/#root-volume - expr: 100 * node_filesystem_free_bytes{cluster_type="management_cluster", mountpoint=~"(/rootfs)?/var/log"} / node_filesystem_size_bytes{cluster_type="management_cluster", mountpoint=~"(/rootfs)?/var/log"} < 20 + description: '{{`Log volume /var/log on {{ $labels.node }} does not have enough free space.`}}' + opsrecipe: low-disk-space/#log-volume + # See above comment for the KubeletVolumeSpaceTooLow alert regarding the node-problem-detector. + # When the node-problem-detector is not remediating, the alert fires once free space has been below 10% for 30 minutes. 
+ expr: (( 100 * (node_filesystem_free_bytes{cluster_type="management_cluster", mountpoint=~"(/rootfs)?/var/log"} +1) / node_filesystem_size_bytes{cluster_type="management_cluster", mountpoint=~"(/rootfs)?/var/log"} < 10) * on (node, cluster_type, cluster_id, installation, organization, pipeline, region, customer) (1 - problem_gauge{reason="VarLogDiskIsFull"}) or sum (100 * (node_filesystem_free_bytes{cluster_type="management_cluster", mountpoint=~"(/rootfs)?/var/log"} +1) / node_filesystem_size_bytes{cluster_type="management_cluster", mountpoint=~"(/rootfs)?/var/log"} < 10) by (node, cluster_type, cluster_id, installation, organization, pipeline, region, customer)) > 0 for: 30m labels: area: kaas diff --git a/helm/prometheus-rules/templates/shared/alerting-rules/disk.workload-cluster.rules.yml b/helm/prometheus-rules/templates/shared/alerting-rules/disk.workload-cluster.rules.yml index d476765e2..4ac6a50fd 100644 --- a/helm/prometheus-rules/templates/shared/alerting-rules/disk.workload-cluster.rules.yml +++ b/helm/prometheus-rules/templates/shared/alerting-rules/disk.workload-cluster.rules.yml @@ -38,9 +38,14 @@ spec: topic: storage - alert: KubeletVolumeSpaceTooLow annotations: - description: '{{`Kubelet volume {{ $labels.mountpoint}} on {{ $labels.instance }} does not have enough free space.`}}' - opsrecipe: low-disk-space/#root-volume - expr: node_filesystem_free_bytes{cluster_type="workload_cluster", mountpoint=~"(/rootfs)?/var/lib/kubelet"} < (2 * 1024 * 1024 * 1024) + description: '{{`Kubelet volume /var/lib/kubelet on {{ $labels.node }} does not have enough free space.`}}' + opsrecipe: low-disk-space/#kubelet-volume + # In clusters where the node-problem-detector-app (https://github.com/giantswarm/node-problem-detector-app/) is installed, we don't want to get alerted if the node-problem-detector is already remediating the issue. 
+ # When this happens, the problem_gauge metric has value 1, so we do a multiply join on (1 - that metric) to get 0 when the metric is present and active, and keep the series values that are > 0. + # The right hand side of the or is necessary because we need to be alerted in clusters without the node-problem-detector. + # Note that we add 1 to the disk free space so we still get alerted when the free bytes are 0. + # When the node-problem-detector is not remediating, the alert fires once free space has been below 2GiB for 30 minutes. + expr: (( node_filesystem_free_bytes{cluster_type="workload_cluster",mountpoint=~"(/rootfs)?/var/lib/kubelet"} +1 < (2 * 1024 * 1024 * 1024)) * on (node, cluster_type, cluster_id, installation, organization, pipeline, region, customer) (1 - problem_gauge{reason="KubeletDiskIsFull"}) or sum (node_filesystem_free_bytes{cluster_type="workload_cluster",mountpoint=~"(/rootfs)?/var/lib/kubelet"} +1 < (2 * 1024 * 1024 * 1024)) by (node, cluster_type, cluster_id, installation, organization, pipeline, region, customer)) > 0 for: 30m labels: area: kaas @@ -49,9 +54,11 @@ spec: topic: storage - alert: LogVolumeSpaceTooLow annotations: - description: '{{`Log volume {{ $labels.mountpoint}} on {{ $labels.instance }} does not have enough free space.`}}' - opsrecipe: low-disk-space/#root-volume - expr: 100 * node_filesystem_free_bytes{cluster_type="workload_cluster", mountpoint=~"(/rootfs)?/var/log"} / node_filesystem_size_bytes{cluster_type="workload_cluster", mountpoint=~"(/rootfs)?/var/log"} < 20 + description: '{{`Log volume /var/log on {{ $labels.node }} does not have enough free space.`}}' + opsrecipe: low-disk-space/#log-volume + # See above comment for the KubeletVolumeSpaceTooLow alert regarding the node-problem-detector. + # When the node-problem-detector is not remediating, the alert fires once free space has been below 10% for 30 minutes. 
+ expr: (( 100 * (node_filesystem_free_bytes{cluster_type="workload_cluster", mountpoint=~"(/rootfs)?/var/log"} +1) / node_filesystem_size_bytes{cluster_type="workload_cluster", mountpoint=~"(/rootfs)?/var/log"} < 10) * on (node, cluster_type, cluster_id, installation, organization, pipeline, region, customer) (1 - problem_gauge{reason="VarLogDiskIsFull"}) or sum (100 * (node_filesystem_free_bytes{cluster_type="workload_cluster", mountpoint=~"(/rootfs)?/var/log"} +1) / node_filesystem_size_bytes{cluster_type="workload_cluster", mountpoint=~"(/rootfs)?/var/log"} < 10) by (node, cluster_type, cluster_id, installation, organization, pipeline, region, customer)) > 0 for: 30m labels: area: kaas