diff --git a/CHANGELOG.md b/CHANGELOG.md index 979164628..23685b5ca 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Changed - Handover cert-manager alerts to BigMac +- Ignore etcd alerts on EKS clusters ## [2.134.1] - 2023-09-26 diff --git a/helm/prometheus-rules/templates/alerting-rules/disk.management-cluster.rules.yml b/helm/prometheus-rules/templates/alerting-rules/disk.management-cluster.rules.yml index 73f787014..9a5d14d40 100644 --- a/helm/prometheus-rules/templates/alerting-rules/disk.management-cluster.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/disk.management-cluster.rules.yml @@ -39,7 +39,7 @@ spec: annotations: description: '{{`Etcd volume {{ $labels.mountpoint}} on {{ $labels.instance }} does not have enough free space.`}}' opsrecipe: low-disk-space/#etcd-volume - expr: 100 * node_filesystem_free_bytes{mountpoint=~"(/rootfs)?/var/lib/etcd"} / node_filesystem_size_bytes{mountpoint=~"(/rootfs)?/var/lib/etcd"} < 10 + expr: 100 * node_filesystem_free_bytes{mountpoint=~"(/rootfs)?/var/lib/etcd", provider!~"eks"} / node_filesystem_size_bytes{mountpoint=~"(/rootfs)?/var/lib/etcd", provider!~"eks"} < 10 for: 10m labels: area: kaas diff --git a/helm/prometheus-rules/templates/alerting-rules/disk.workload-cluster.rules.yml b/helm/prometheus-rules/templates/alerting-rules/disk.workload-cluster.rules.yml index b45a7e0a7..68eff7115 100644 --- a/helm/prometheus-rules/templates/alerting-rules/disk.workload-cluster.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/disk.workload-cluster.rules.yml @@ -26,7 +26,7 @@ spec: annotations: description: '{{`Etcd volume {{ $labels.mountpoint}} on {{ $labels.instance }} does not have enough free space.`}}' opsrecipe: low-disk-space/#etcd-volume - expr: 100 * node_filesystem_free_bytes{mountpoint=~"(/rootfs)?/var/lib/etcd"} / node_filesystem_size_bytes{mountpoint=~"(/rootfs)?/var/lib/etcd"} < 10 + 
expr: 100 * node_filesystem_free_bytes{mountpoint=~"(/rootfs)?/var/lib/etcd", provider!~"eks"} / node_filesystem_size_bytes{mountpoint=~"(/rootfs)?/var/lib/etcd", provider!~"eks"} < 10 for: 10m labels: area: kaas diff --git a/helm/prometheus-rules/templates/alerting-rules/etcd.management-cluster.rules.yml b/helm/prometheus-rules/templates/alerting-rules/etcd.management-cluster.rules.yml index 04902f685..5d867ca21 100644 --- a/helm/prometheus-rules/templates/alerting-rules/etcd.management-cluster.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/etcd.management-cluster.rules.yml @@ -15,7 +15,7 @@ spec: annotations: description: '{{`Etcd ({{ $labels.instance }}) has a too high commit duration.`}}' opsrecipe: etcd-high-commit-duration/ - expr: histogram_quantile(0.95, rate(etcd_disk_backend_commit_duration_seconds_bucket{cluster_type="management_cluster"}[5m])) > 1.0 + expr: histogram_quantile(0.95, rate(etcd_disk_backend_commit_duration_seconds_bucket{cluster_type="management_cluster", provider!~"eks"}[5m])) > 1.0 for: 15m labels: area: kaas @@ -27,7 +27,7 @@ spec: annotations: description: '{{`Etcd ({{ $labels.instance }}) has a too large database.`}}' opsrecipe: etcd-db-size-too-large/ - expr: (etcd_mvcc_db_total_size_in_bytes{cluster_type="management_cluster"} / etcd_server_quota_backend_bytes{cluster_type="management_cluster"}) * 100 > 80 + expr: (etcd_mvcc_db_total_size_in_bytes{cluster_type="management_cluster", provider!~"eks"} / etcd_server_quota_backend_bytes{cluster_type="management_cluster", provider!~"eks"}) * 100 > 80 for: 90m labels: area: kaas @@ -38,7 +38,7 @@ spec: - alert: ManagementClusterEtcdNumberOfLeaderChangesTooHigh annotations: description: '{{`Etcd has too many leader changes.`}}' - expr: increase(etcd_server_leader_changes_seen_total{cluster_type="management_cluster"}[1h]) > 8 + expr: increase(etcd_server_leader_changes_seen_total{cluster_type="management_cluster", provider!~"eks"}[1h]) > 8 labels: area: kaas 
cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }} @@ -49,7 +49,7 @@ spec: annotations: description: '{{`Etcd has no leader.`}}' opsrecipe: etcd-has-no-leader/ - expr: etcd_server_has_leader{role=~"master|control-plane", cluster_type="management_cluster"} == 0 + expr: etcd_server_has_leader{role=~"master|control-plane", cluster_type="management_cluster", provider!~"eks"} == 0 for: 5m labels: area: kaas @@ -61,7 +61,7 @@ spec: annotations: description: '{{`Etcd metrics missing for {{ $labels.cluster_id }}.`}}' opsrecipe: etcd-metrics-missing/ - expr: count(up{cluster_type="management_cluster"}) by (cluster_id) unless count(etcd_server_id) by (cluster_id) + expr: count(up{cluster_type="management_cluster", provider!~"eks"}) by (cluster_id) unless count(etcd_server_id{provider!~"eks"}) by (cluster_id) for: 1h labels: area: kaas diff --git a/helm/prometheus-rules/templates/alerting-rules/etcd.workload-cluster.rules.yml b/helm/prometheus-rules/templates/alerting-rules/etcd.workload-cluster.rules.yml index 127cd03d0..882221e39 100644 --- a/helm/prometheus-rules/templates/alerting-rules/etcd.workload-cluster.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/etcd.workload-cluster.rules.yml @@ -15,7 +15,7 @@ spec: annotations: description: '{{`Etcd ({{ $labels.instance }}) on workload cluster {{ $labels.cluster_id }} is down.`}}' opsrecipe: etcd-down/ - expr: up{cluster_type="workload_cluster", app="etcd"} == 0 + expr: up{cluster_type="workload_cluster", app="etcd", provider!~"eks"} == 0 for: 20m labels: area: kaas @@ -31,7 +31,7 @@ spec: annotations: description: '{{`Etcd ({{ $labels.instance }}) has a too high commit duration.`}}' opsrecipe: etcd-high-commit-duration/ - expr: histogram_quantile(0.95, rate(etcd_disk_backend_commit_duration_seconds_bucket{cluster_type="workload_cluster"}[5m])) > 1.0 + expr: histogram_quantile(0.95, rate(etcd_disk_backend_commit_duration_seconds_bucket{cluster_type="workload_cluster", provider!~"eks"}[5m])) > 
1.0 for: 15m labels: area: kaas @@ -43,7 +43,7 @@ spec: annotations: description: '{{`Etcd ({{ $labels.instance }}) has a too large database.`}}' opsrecipe: etcd-db-size-too-large/ - expr: (etcd_mvcc_db_total_size_in_bytes{cluster_type="workload_cluster"} / etcd_server_quota_backend_bytes{cluster_type="workload_cluster"}) * 100 > 80 + expr: (etcd_mvcc_db_total_size_in_bytes{cluster_type="workload_cluster", provider!~"eks"} / etcd_server_quota_backend_bytes{cluster_type="workload_cluster", provider!~"eks"}) * 100 > 80 for: 15m labels: area: kaas @@ -54,7 +54,7 @@ spec: - alert: WorkloadClusterEtcdNumberOfLeaderChangesTooHigh annotations: description: '{{`Etcd has too many leader changes.`}}' - expr: increase(etcd_server_leader_changes_seen_total{cluster_type="workload_cluster"}[1h]) > 8 + expr: increase(etcd_server_leader_changes_seen_total{cluster_type="workload_cluster", provider!~"eks"}[1h]) > 8 labels: area: kaas severity: notify @@ -64,7 +64,7 @@ spec: annotations: description: '{{`Etcd has no leader.`}}' opsrecipe: etcd-has-no-leader/ - expr: etcd_server_has_leader{cluster_type="workload_cluster", container!~"loki|promtail"} == 0 + expr: etcd_server_has_leader{cluster_type="workload_cluster", container!~"loki|promtail", provider!~"eks"} == 0 for: 35m labels: area: kaas @@ -76,7 +76,7 @@ spec: annotations: description: '{{`Etcd metrics missing for {{ $labels.cluster_id }}.`}}' opsrecipe: etcd-metrics-missing/ - expr: count(up{cluster_type="workload_cluster"}) by (cluster_id) unless count(etcd_server_id) by (cluster_id) + expr: count(up{cluster_type="workload_cluster", provider!~"eks"}) by (cluster_id) unless count(etcd_server_id{provider!~"eks"}) by (cluster_id) for: 1h labels: area: kaas