Skip to content

Commit

Permalink
Update disk alerts to not trigger when node-problem-detector is alrea…
Browse files Browse the repository at this point in the history
…dy remediating the issue. (#1203)
  • Loading branch information
weseven authored May 30, 2024
1 parent bddf46d commit a943a83
Show file tree
Hide file tree
Showing 3 changed files with 34 additions and 14 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

### Changed

- Updated `ContainerdVolumeSpaceTooLow`, `KubeletVolumeSpaceTooLow` and `LogVolumeSpaceTooLow` alerts to not trigger when the node-problem-detector is already remediating the issue.

## [4.1.1] - 2024-05-30

### Changed
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,11 @@ spec:
topic: storage
- alert: ContainerdVolumeSpaceTooLow
annotations:
description: '{{`Containerd volume {{ $labels.mountpoint}} on {{ $labels.instance }} does not have enough free space.`}}'
description: '{{`Containerd volume /var/lib/containerd on {{ $labels.node }} does not have enough free space.`}}'
opsrecipe: low-disk-space/#containerd-volume
expr: 100 * node_filesystem_free_bytes{cluster_type="management_cluster", mountpoint=~"(/rootfs)?/var/lib/containerd"} / node_filesystem_size_bytes{cluster_type="management_cluster", mountpoint=~"(/rootfs)?/var/lib/containerd"} < 10
# See the comment on the KubeletVolumeSpaceTooLow alert below regarding the node-problem-detector.
# We are also alerted if the free space is less than 10% for 10 minutes.
expr: (( 100 * (node_filesystem_free_bytes{cluster_type="management_cluster", mountpoint=~"(/rootfs)?/var/lib/containerd"} +1) / node_filesystem_size_bytes{cluster_type="management_cluster", mountpoint=~"(/rootfs)?/var/lib/containerd"} < 10) * on (node, cluster_type, cluster_id, installation, organization, pipeline, region, customer) (1 - problem_gauge{reason="ContainerdDiskIsFull"}) or sum ((100 * node_filesystem_free_bytes{cluster_type="management_cluster", mountpoint=~"(/rootfs)?/var/lib/containerd"} +1)/ node_filesystem_size_bytes{cluster_type="management_cluster", mountpoint=~"(/rootfs)?/var/lib/containerd"} < 10) by (node, cluster_type, cluster_id, installation, organization, pipeline, region, customer)) > 0
for: 10m
labels:
area: kaas
Expand All @@ -51,9 +53,14 @@ spec:
topic: storage
- alert: KubeletVolumeSpaceTooLow
annotations:
description: '{{`Kubelet volume {{ $labels.mountpoint}} on {{ $labels.instance }} does not have enough free space.`}}'
opsrecipe: low-disk-space/#root-volume
expr: node_filesystem_free_bytes{cluster_type="management_cluster", mountpoint=~"(/rootfs)?/var/lib/kubelet"} < (2 * 1024 * 1024 * 1024)
description: '{{`Kubelet volume /var/lib/kubelet on {{ $labels.node }} does not have enough free space.`}}'
opsrecipe: low-disk-space/#kubelet-volume
# In clusters where the node-problem-detector-app (https://github.com/giantswarm/node-problem-detector-app/) is installed, we don't want to get alerted if the node-problem-detector is already remediating the issue.
# When this happens, the problem_gauge metric has value 1, so we do a multiply join on (1 - problem_gauge): the product is 0 when the metric is present and active, and we keep only the series whose value is > 0.
# The right hand side of the or is necessary because we need to be alerted in clusters without the node-problem-detector.
# Note that we add 1 to the disk free space so we still get alerted when the free bytes are 0: without it the product would be 0 and the final `> 0` filter would drop the series.
# We are also alerted if the free space is less than 2GB for 10 minutes.
expr: (( node_filesystem_free_bytes{cluster_type="management_cluster",mountpoint=~"(/rootfs)?/var/lib/kubelet"} +1 < (2 * 1024 * 1024 * 1024)) * on (node, cluster_type, cluster_id, installation, organization, pipeline, region, customer) (1 - problem_gauge{reason="KubeletDiskIsFull"}) or sum (node_filesystem_free_bytes{cluster_type="management_cluster",mountpoint=~"(/rootfs)?/var/lib/kubelet"} +1 < (2 * 1024 * 1024 * 1024)) by (node, cluster_type, cluster_id, installation, organization, pipeline, region, customer)) > 0
for: 10m
labels:
area: kaas
Expand All @@ -63,9 +70,11 @@ spec:
topic: storage
- alert: LogVolumeSpaceTooLow
annotations:
description: '{{`Log volume {{ $labels.mountpoint}} on {{ $labels.instance }} does not have enough free space.`}}'
opsrecipe: low-disk-space/#root-volume
expr: 100 * node_filesystem_free_bytes{cluster_type="management_cluster", mountpoint=~"(/rootfs)?/var/log"} / node_filesystem_size_bytes{cluster_type="management_cluster", mountpoint=~"(/rootfs)?/var/log"} < 20
description: '{{`Log volume /var/log on {{ $labels.node }} does not have enough free space.`}}'
opsrecipe: low-disk-space/#log-volume
# See the comment on the KubeletVolumeSpaceTooLow alert above regarding the node-problem-detector.
# We are also alerted if the free space is less than 10% for 30 minutes.
expr: (( 100 * (node_filesystem_free_bytes{cluster_type="management_cluster", mountpoint=~"(/rootfs)?/var/log"} +1) / node_filesystem_size_bytes{cluster_type="management_cluster", mountpoint=~"(/rootfs)?/var/log"} < 10) * on (node, cluster_type, cluster_id, installation, organization, pipeline, region, customer) (1 - problem_gauge{reason="VarLogDiskIsFull"}) or sum ((100 * node_filesystem_free_bytes{cluster_type="management_cluster", mountpoint=~"(/rootfs)?/var/log"} +1)/ node_filesystem_size_bytes{cluster_type="management_cluster", mountpoint=~"(/rootfs)?/var/log"} < 10) by (node, cluster_type, cluster_id, installation, organization, pipeline, region, customer)) > 0
for: 30m
labels:
area: kaas
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,9 +38,14 @@ spec:
topic: storage
- alert: KubeletVolumeSpaceTooLow
annotations:
description: '{{`Kubelet volume {{ $labels.mountpoint}} on {{ $labels.instance }} does not have enough free space.`}}'
opsrecipe: low-disk-space/#root-volume
expr: node_filesystem_free_bytes{cluster_type="workload_cluster", mountpoint=~"(/rootfs)?/var/lib/kubelet"} < (2 * 1024 * 1024 * 1024)
description: '{{`Kubelet volume /var/lib/kubelet on {{ $labels.node }} does not have enough free space.`}}'
opsrecipe: low-disk-space/#kubelet-volume
# In clusters where the node-problem-detector-app (https://github.com/giantswarm/node-problem-detector-app/) is installed, we don't want to get alerted if the node-problem-detector is already remediating the issue.
# When this happens, the problem_gauge metric has value 1, so we do a multiply join on (1 - problem_gauge): the product is 0 when the metric is present and active, and we keep only the series whose value is > 0.
# The right hand side of the or is necessary because we need to be alerted in clusters without the node-problem-detector.
# Note that we add 1 to the disk free space so we still get alerted when the free bytes are 0: without it the product would be 0 and the final `> 0` filter would drop the series.
# We are also alerted if the free space is less than 2GB for 30 minutes.
expr: (( node_filesystem_free_bytes{cluster_type="workload_cluster",mountpoint=~"(/rootfs)?/var/lib/kubelet"} +1 < (2 * 1024 * 1024 * 1024)) * on (node, cluster_type, cluster_id, installation, organization, pipeline, region, customer) (1 - problem_gauge{reason="KubeletDiskIsFull"}) or sum (node_filesystem_free_bytes{cluster_type="workload_cluster",mountpoint=~"(/rootfs)?/var/lib/kubelet"} +1 < (2 * 1024 * 1024 * 1024)) by (node, cluster_type, cluster_id, installation, organization, pipeline, region, customer)) > 0
for: 30m
labels:
area: kaas
Expand All @@ -49,9 +54,11 @@ spec:
topic: storage
- alert: LogVolumeSpaceTooLow
annotations:
description: '{{`Log volume {{ $labels.mountpoint}} on {{ $labels.instance }} does not have enough free space.`}}'
opsrecipe: low-disk-space/#root-volume
expr: 100 * node_filesystem_free_bytes{cluster_type="workload_cluster", mountpoint=~"(/rootfs)?/var/log"} / node_filesystem_size_bytes{cluster_type="workload_cluster", mountpoint=~"(/rootfs)?/var/log"} < 20
description: '{{`Log volume /var/log on {{ $labels.node }} does not have enough free space.`}}'
opsrecipe: low-disk-space/#log-volume
# See the comment on the KubeletVolumeSpaceTooLow alert above regarding the node-problem-detector.
# We are also alerted if the free space is less than 10% for 30 minutes.
expr: (( 100 * (node_filesystem_free_bytes{cluster_type="workload_cluster", mountpoint=~"(/rootfs)?/var/log"} +1) / node_filesystem_size_bytes{cluster_type="workload_cluster", mountpoint=~"(/rootfs)?/var/log"} < 10) * on (node, cluster_type, cluster_id, installation, organization, pipeline, region, customer) (1 - problem_gauge{reason="VarLogDiskIsFull"}) or sum ((100 * node_filesystem_free_bytes{cluster_type="workload_cluster", mountpoint=~"(/rootfs)?/var/log"} +1)/ node_filesystem_size_bytes{cluster_type="workload_cluster", mountpoint=~"(/rootfs)?/var/log"} < 10) by (node, cluster_type, cluster_id, installation, organization, pipeline, region, customer)) > 0
for: 30m
labels:
area: kaas
Expand Down

0 comments on commit a943a83

Please sign in to comment.