From 8607e1c16ba52bdc5a0f52ac63f73bdf3e11246c Mon Sep 17 00:00:00 2001 From: QuentinBisson Date: Wed, 12 Jun 2024 12:09:05 +0200 Subject: [PATCH] Add shared irsa alert between capa and vintage --- CHANGELOG.md | 1 + .../capa.management-cluster.rules.yml | 14 ------ .../phoenix/alerting-rules/irsa.rules.yml | 47 +++++++++++++++++++ .../vintage.aws.management-cluster.rules.yml | 41 ---------------- test/conf/promtool_ignore | 1 + 5 files changed, 49 insertions(+), 55 deletions(-) create mode 100644 helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/irsa.rules.yml diff --git a/CHANGELOG.md b/CHANGELOG.md index 7e22d3822..ac75e5c72 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -29,6 +29,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Prefix all vintage alerts with `vintage` to facilitate maintenance. - Merge `kiam` and `inhibit.kiam` into one file. - Support any AWS WC in the aws-load-balancer-controller alerts. + - Create a shared IRSA alerts rule file to avoid duplication between capa and vintage aws. - Review and fix cabbage alerts for multi-provider MCs. ### Fixed diff --git a/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/capa.management-cluster.rules.yml b/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/capa.management-cluster.rules.yml index 088624c81..b11cca997 100644 --- a/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/capa.management-cluster.rules.yml +++ b/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/capa.management-cluster.rules.yml @@ -62,18 +62,4 @@ spec: severity: page team: phoenix topic: kubernetes - - alert: IRSATooManyErrors - annotations: - description: '{{`IRSA bootstrap for Cluster {{ $labels.cluster_id }} failed.`}}' - opsrecipe: irsa-operator-error/ - dashboard: bdiako8tt1b7kc/capa-agregated-error-logs-for-capa-controllers - expr: irsa_operator_cluster_errors > 0 - for: 10m - labels: - area: kaas - cancel_if_kube_state_metrics_down: "true" - cancel_if_outside_working_hours: "true" - severity: page - team: phoenix - topic: aws {{- end }} diff --git a/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/irsa.rules.yml b/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/irsa.rules.yml new file mode 100644 index 000000000..fc719f2f5 --- /dev/null +++ b/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/irsa.rules.yml @@ -0,0 +1,47 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + labels: + {{- include "labels.common" . | nindent 4 }} +{{- if not .Values.mimir.enabled }} + cluster_type: "management_cluster" +{{- end }} + name: irsa.rules + namespace: {{ .Values.namespace }} +spec: + groups: + - name: irsa-operator + rules: + - alert: IRSATooManyErrors + annotations: + description: '{{`IRSA bootstrap for Cluster {{ $labels.cluster_id }} failed.`}}' + opsrecipe: irsa-operator-error/ + ## TODO Remove this if all vintage installations are gone + {{- if eq .Values.managementCluster.provider.flavor "capi" }} + dashboard: bdiako8tt1b7kc/capa-agregated-error-logs-for-capa-controllers + {{- end }} + expr: irsa_operator_cluster_errors{cluster_type="management_cluster"} > 0 + for: 10m + labels: + area: kaas + cancel_if_kube_state_metrics_down: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_outside_working_hours: "true" + severity: page + team: phoenix + topic: aws + - alert: IRSAACMCertificateExpiringInLessThan60Days + annotations: + description: '{{`IRSA ACM certificate for Cluster {{ $labels.cluster_id }} ({{ $labels.certificate_name }}) will expire in less than 2 months.`}}' + opsrecipe: irsa-acm-certificate-expiring/ + expr: min(irsa_operator_acm_certificate_not_after{cluster_type="management_cluster"}) by (cluster_id, installation, pipeline, provider, certificate_name) - time() < 5184000 + for: 10m + labels: + area: kaas + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_outside_working_hours: "true" + severity: page + team: phoenix + topic: aws diff --git a/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/vintage.aws.management-cluster.rules.yml b/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/vintage.aws.management-cluster.rules.yml index 17c3284d7..57b19c211 100644 --- a/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/vintage.aws.management-cluster.rules.yml +++ b/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/vintage.aws.management-cluster.rules.yml @@ -162,45 +162,4 @@ spec: severity: page team: phoenix topic: kubernetes - - alert: IRSATooManyErrors - annotations: - description: '{{`IRSA bootstrap for Cluster {{ $labels.cluster_id }} failed.`}}' - opsrecipe: irsa-operator-error/ - expr: irsa_operator_cluster_errors > 0 - for: 10m - labels: - area: kaas - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - cancel_if_outside_working_hours: "true" - severity: page - team: phoenix - topic: aws - - alert: IRSAACMCertificateExpiringInLessThan60Days - annotations: - description: '{{`IRSA ACM certificate for Cluster {{ $labels.cluster_id }} ({{ $labels.certificate_name }}) will expire in less than 2 months.`}}' - opsrecipe: irsa-acm-certificate-expiring/ - expr: min(irsa_operator_acm_certificate_not_after) by (cluster_id, installation, pipeline, provider, certificate_name) - time() < 5184000 - for: 10m - labels: - area: kaas - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - cancel_if_outside_working_hours: "true" - severity: page - team: phoenix - topic: aws - - name: aws-jobs - rules: - - alert: JobHasNotBeenScheduledForTooLong - annotations: - description: '{{`CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} has not been scheduled for more than 2 hours.`}}' - opsrecipe: job-has-not-been-scheduled-for-too-long/ - expr: (time() - kube_cronjob_status_last_schedule_time{cronjob="route53-manager"}) > 7200 - for: 15m - labels: - area: kaas - severity: page - team: phoenix - topic: managementcluster {{- end }} diff --git a/test/conf/promtool_ignore b/test/conf/promtool_ignore index ba4423929..b64b2a342 100644 --- a/test/conf/promtool_ignore +++ b/test/conf/promtool_ignore @@ -6,6 +6,7 @@ kaas/phoenix/alerting-rules/aws.node.workload-cluster.rules.yml kaas/phoenix/alerting-rules/aws.workload-cluster.rules.yml kaas/phoenix/alerting-rules/capa.management-cluster.rules.yml kaas/phoenix/alerting-rules/dns-operator-azure.rules.yml +kaas/phoenix/alerting-rules/irsa.rules.yml kaas/phoenix/alerting-rules/vintage.aws.management-cluster.rules.yml kaas/phoenix/alerting-rules/vintage.calico.rules.yml kaas/phoenix/alerting-rules/vintage.cluster-service.rules.yml