Add shared irsa alert between capa and vintage

giantswarm · Jun 12, 2024 · 8607e1c · 8607e1c
1 parent 8580020
commit 8607e1c
Show file tree

Hide file tree

Showing 5 changed files with 49 additions and 55 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -29,6 +29,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
   - Prefix all vintage alerts with `vintage` to facilitate maintenance.
   - Merge `kiam` and `inhibit.kiam` into one file.
   - Support any AWS WC in the aws-load-balancer-controller alerts.
+  - Create a shared IRSA alerting rules file to avoid duplication between CAPA and vintage AWS.
 - Review and fix cabbage alerts for multi-provider MCs.
 
 ### Fixed

diff --git a/.../prometheus-rules/templates/kaas/phoenix/alerting-rules/capa.management-cluster.rules.yml b/.../prometheus-rules/templates/kaas/phoenix/alerting-rules/capa.management-cluster.rules.yml
@@ -62,18 +62,4 @@ spec:
         severity: page
         team: phoenix
         topic: kubernetes
-    - alert: IRSATooManyErrors
-      annotations:
-        description: '{{`IRSA bootstrap for Cluster {{ $labels.cluster_id }} failed.`}}'
-        opsrecipe: irsa-operator-error/
-        dashboard: bdiako8tt1b7kc/capa-agregated-error-logs-for-capa-controllers
-      expr: irsa_operator_cluster_errors > 0
-      for: 10m
-      labels:
-        area: kaas
-        cancel_if_kube_state_metrics_down: "true"
-        cancel_if_outside_working_hours: "true"
-        severity: page
-        team: phoenix
-        topic: aws
 {{- end }}
diff --git a/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/irsa.rules.yml b/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/irsa.rules.yml
@@ -0,0 +1,47 @@
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  labels:
+    {{- include "labels.common" . | nindent 4 }}
+{{- if not .Values.mimir.enabled }}
+    cluster_type: "management_cluster"
+{{- end }}
+  name: irsa.rules
+  namespace: {{ .Values.namespace  }}
+spec:
+  groups:
+  - name: irsa-operator
+    rules:
+    - alert: IRSATooManyErrors
+      annotations:
+        description: '{{`IRSA bootstrap for Cluster {{ $labels.cluster_id }} failed.`}}'
+        opsrecipe: irsa-operator-error/
+        ## TODO: Remove this `if` guard once all vintage installations are gone.
+        {{- if eq .Values.managementCluster.provider.flavor "capi" }}
+        dashboard: bdiako8tt1b7kc/capa-agregated-error-logs-for-capa-controllers
+        {{- end }}
+      expr: irsa_operator_cluster_errors{cluster_type="management_cluster"} > 0
+      for: 10m
+      labels:
+        area: kaas
+        cancel_if_kube_state_metrics_down: "true"
+        cancel_if_cluster_status_creating: "true"
+        cancel_if_cluster_status_deleting: "true"
+        cancel_if_outside_working_hours: "true"
+        severity: page
+        team: phoenix
+        topic: aws
+    - alert: IRSAACMCertificateExpiringInLessThan60Days
+      annotations:
+        description: '{{`IRSA ACM certificate for Cluster {{ $labels.cluster_id }} ({{ $labels.certificate_name }}) will expire in less than 2 months.`}}'
+        opsrecipe: irsa-acm-certificate-expiring/
+      expr: min(irsa_operator_acm_certificate_not_after{cluster_type="management_cluster"}) by (cluster_id, installation, pipeline, provider, certificate_name) - time() < 5184000
+      for: 10m
+      labels:
+        area: kaas
+        cancel_if_cluster_status_creating: "true"
+        cancel_if_cluster_status_deleting: "true"
+        cancel_if_outside_working_hours: "true"
+        severity: page
+        team: phoenix
+        topic: aws
diff --git a/...heus-rules/templates/kaas/phoenix/alerting-rules/vintage.aws.management-cluster.rules.yml b/...heus-rules/templates/kaas/phoenix/alerting-rules/vintage.aws.management-cluster.rules.yml
@@ -162,45 +162,4 @@ spec:
         severity: page
         team: phoenix
         topic: kubernetes
-    - alert: IRSATooManyErrors
-      annotations:
-        description: '{{`IRSA bootstrap for Cluster {{ $labels.cluster_id }} failed.`}}'
-        opsrecipe: irsa-operator-error/
-      expr: irsa_operator_cluster_errors > 0
-      for: 10m
-      labels:
-        area: kaas
-        cancel_if_cluster_status_creating: "true"
-        cancel_if_cluster_status_deleting: "true"
-        cancel_if_outside_working_hours: "true"
-        severity: page
-        team: phoenix
-        topic: aws
-    - alert: IRSAACMCertificateExpiringInLessThan60Days
-      annotations:
-        description: '{{`IRSA ACM certificate for Cluster {{ $labels.cluster_id }} ({{ $labels.certificate_name }}) will expire in less than 2 months.`}}'
-        opsrecipe: irsa-acm-certificate-expiring/
-      expr: min(irsa_operator_acm_certificate_not_after) by (cluster_id, installation, pipeline, provider, certificate_name) - time() < 5184000
-      for: 10m
-      labels:
-        area: kaas
-        cancel_if_cluster_status_creating: "true"
-        cancel_if_cluster_status_deleting: "true"
-        cancel_if_outside_working_hours: "true"
-        severity: page
-        team: phoenix
-        topic: aws
-  - name: aws-jobs
-    rules:
-    - alert: JobHasNotBeenScheduledForTooLong
-      annotations:
-        description: '{{`CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} has not been scheduled for more than 2 hours.`}}'
-        opsrecipe: job-has-not-been-scheduled-for-too-long/
-      expr: (time() - kube_cronjob_status_last_schedule_time{cronjob="route53-manager"}) > 7200
-      for: 15m
-      labels:
-        area: kaas
-        severity: page
-        team: phoenix
-        topic: managementcluster
 {{- end }}
diff --git a/test/conf/promtool_ignore b/test/conf/promtool_ignore
@@ -6,6 +6,7 @@ kaas/phoenix/alerting-rules/aws.node.workload-cluster.rules.yml
 kaas/phoenix/alerting-rules/aws.workload-cluster.rules.yml
 kaas/phoenix/alerting-rules/capa.management-cluster.rules.yml
 kaas/phoenix/alerting-rules/dns-operator-azure.rules.yml
+kaas/phoenix/alerting-rules/irsa.rules.yml
 kaas/phoenix/alerting-rules/vintage.aws.management-cluster.rules.yml
 kaas/phoenix/alerting-rules/vintage.calico.rules.yml
 kaas/phoenix/alerting-rules/vintage.cluster-service.rules.yml