Skip to content

Commit

Permalink
Ignore etcd alerts on EKS (#918)
Browse files Browse the repository at this point in the history
  • Loading branch information
QuentinBisson authored Oct 2, 2023
1 parent 7be52c8 commit cb2d482
Show file tree
Hide file tree
Showing 5 changed files with 14 additions and 13 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Changed

- Handover cert-manager alerts to BigMac
- Ignore etcd alerts on EKS clusters.

## [2.134.1] - 2023-09-26

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ spec:
annotations:
description: '{{`Etcd volume {{ $labels.mountpoint}} on {{ $labels.instance }} does not have enough free space.`}}'
opsrecipe: low-disk-space/#etcd-volume
expr: 100 * node_filesystem_free_bytes{mountpoint=~"(/rootfs)?/var/lib/etcd"} / node_filesystem_size_bytes{mountpoint=~"(/rootfs)?/var/lib/etcd"} < 10
expr: 100 * node_filesystem_free_bytes{mountpoint=~"(/rootfs)?/var/lib/etcd", provider!~"eks"} / node_filesystem_size_bytes{mountpoint=~"(/rootfs)?/var/lib/etcd", provider!~"eks"} < 10
for: 10m
labels:
area: kaas
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ spec:
annotations:
description: '{{`Etcd volume {{ $labels.mountpoint}} on {{ $labels.instance }} does not have enough free space.`}}'
opsrecipe: low-disk-space/#etcd-volume
expr: 100 * node_filesystem_free_bytes{mountpoint=~"(/rootfs)?/var/lib/etcd"} / node_filesystem_size_bytes{mountpoint=~"(/rootfs)?/var/lib/etcd"} < 10
expr: 100 * node_filesystem_free_bytes{mountpoint=~"(/rootfs)?/var/lib/etcd", provider!~"eks"} / node_filesystem_size_bytes{mountpoint=~"(/rootfs)?/var/lib/etcd", provider!~"eks"} < 10
for: 10m
labels:
area: kaas
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ spec:
annotations:
description: '{{`Etcd ({{ $labels.instance }}) has a too high commit duration.`}}'
opsrecipe: etcd-high-commit-duration/
expr: histogram_quantile(0.95, rate(etcd_disk_backend_commit_duration_seconds_bucket{cluster_type="management_cluster"}[5m])) > 1.0
expr: histogram_quantile(0.95, rate(etcd_disk_backend_commit_duration_seconds_bucket{cluster_type="management_cluster", provider!~"eks"}[5m])) > 1.0
for: 15m
labels:
area: kaas
Expand All @@ -27,7 +27,7 @@ spec:
annotations:
description: '{{`Etcd ({{ $labels.instance }}) has a too large database.`}}'
opsrecipe: etcd-db-size-too-large/
expr: (etcd_mvcc_db_total_size_in_bytes{cluster_type="management_cluster"} / etcd_server_quota_backend_bytes{cluster_type="management_cluster"}) * 100 > 80
expr: (etcd_mvcc_db_total_size_in_bytes{cluster_type="management_cluster", provider!~"eks"} / etcd_server_quota_backend_bytes{cluster_type="management_cluster", provider!~"eks"}) * 100 > 80
for: 90m
labels:
area: kaas
Expand All @@ -38,7 +38,7 @@ spec:
- alert: ManagementClusterEtcdNumberOfLeaderChangesTooHigh
annotations:
description: '{{`Etcd has too many leader changes.`}}'
expr: increase(etcd_server_leader_changes_seen_total{cluster_type="management_cluster"}[1h]) > 8
expr: increase(etcd_server_leader_changes_seen_total{cluster_type="management_cluster", provider!~"eks"}[1h]) > 8
labels:
area: kaas
cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }}
Expand All @@ -49,7 +49,7 @@ spec:
annotations:
description: '{{`Etcd has no leader.`}}'
opsrecipe: etcd-has-no-leader/
expr: etcd_server_has_leader{role=~"master|control-plane", cluster_type="management_cluster"} == 0
expr: etcd_server_has_leader{role=~"master|control-plane", cluster_type="management_cluster", provider!~"eks"} == 0
for: 5m
labels:
area: kaas
Expand All @@ -61,7 +61,7 @@ spec:
annotations:
description: '{{`Etcd metrics missing for {{ $labels.cluster_id }}.`}}'
opsrecipe: etcd-metrics-missing/
expr: count(up{cluster_type="management_cluster"}) by (cluster_id) unless count(etcd_server_id) by (cluster_id)
expr: count(up{cluster_type="management_cluster", provider!~"eks"}) by (cluster_id) unless count(etcd_server_id{provider!~"eks"}) by (cluster_id)
for: 1h
labels:
area: kaas
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ spec:
annotations:
description: '{{`Etcd ({{ $labels.instance }}) on workload cluster {{ $labels.cluster_id }} is down.`}}'
opsrecipe: etcd-down/
expr: up{cluster_type="workload_cluster", app="etcd"} == 0
expr: up{cluster_type="workload_cluster", app="etcd", provider!~"eks"} == 0
for: 20m
labels:
area: kaas
Expand All @@ -31,7 +31,7 @@ spec:
annotations:
description: '{{`Etcd ({{ $labels.instance }}) has a too high commit duration.`}}'
opsrecipe: etcd-high-commit-duration/
expr: histogram_quantile(0.95, rate(etcd_disk_backend_commit_duration_seconds_bucket{cluster_type="workload_cluster"}[5m])) > 1.0
expr: histogram_quantile(0.95, rate(etcd_disk_backend_commit_duration_seconds_bucket{cluster_type="workload_cluster", provider!~"eks"}[5m])) > 1.0
for: 15m
labels:
area: kaas
Expand All @@ -43,7 +43,7 @@ spec:
annotations:
description: '{{`Etcd ({{ $labels.instance }}) has a too large database.`}}'
opsrecipe: etcd-db-size-too-large/
expr: (etcd_mvcc_db_total_size_in_bytes{cluster_type="workload_cluster"} / etcd_server_quota_backend_bytes{cluster_type="workload_cluster"}) * 100 > 80
expr: (etcd_mvcc_db_total_size_in_bytes{cluster_type="workload_cluster", provider!~"eks"} / etcd_server_quota_backend_bytes{cluster_type="workload_cluster", provider!~"eks"}) * 100 > 80
for: 15m
labels:
area: kaas
Expand All @@ -54,7 +54,7 @@ spec:
- alert: WorkloadClusterEtcdNumberOfLeaderChangesTooHigh
annotations:
description: '{{`Etcd has too many leader changes.`}}'
expr: increase(etcd_server_leader_changes_seen_total{cluster_type="workload_cluster"}[1h]) > 8
expr: increase(etcd_server_leader_changes_seen_total{cluster_type="workload_cluster", provider!~"eks"}[1h]) > 8
labels:
area: kaas
severity: notify
Expand All @@ -64,7 +64,7 @@ spec:
annotations:
description: '{{`Etcd has no leader.`}}'
opsrecipe: etcd-has-no-leader/
expr: etcd_server_has_leader{cluster_type="workload_cluster", container!~"loki|promtail"} == 0
expr: etcd_server_has_leader{cluster_type="workload_cluster", container!~"loki|promtail", provider!~"eks"} == 0
for: 35m
labels:
area: kaas
Expand All @@ -76,7 +76,7 @@ spec:
annotations:
description: '{{`Etcd metrics missing for {{ $labels.cluster_id }}.`}}'
opsrecipe: etcd-metrics-missing/
expr: count(up{cluster_type="workload_cluster"}) by (cluster_id) unless count(etcd_server_id) by (cluster_id)
expr: count(up{cluster_type="workload_cluster", provider!~"eks"}) by (cluster_id) unless count(etcd_server_id{provider!~"eks"}) by (cluster_id)
for: 1h
labels:
area: kaas
Expand Down

0 comments on commit cb2d482

Please sign in to comment.