Skip to content

Commit

Permalink
Add shared irsa alert between capa and vintage
Browse files Browse the repository at this point in the history
  • Loading branch information
QuentinBisson committed Jun 12, 2024
1 parent 8580020 commit 8607e1c
Show file tree
Hide file tree
Showing 5 changed files with 49 additions and 55 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Prefix all vintage alerts with `vintage` to facilitate maintenance.
- Merge `kiam` and `inhibit.kiam` into one file.
- Support any AWS WC in the aws-load-balancer-controller alerts.
- Create a shared IRSA alerts rule file to avoid duplication between CAPA and vintage AWS.
- Review and fix cabbage alerts for multi-provider MCs.

### Fixed
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -62,18 +62,4 @@ spec:
severity: page
team: phoenix
topic: kubernetes
- alert: IRSATooManyErrors
annotations:
description: '{{`IRSA bootstrap for Cluster {{ $labels.cluster_id }} failed.`}}'
opsrecipe: irsa-operator-error/
dashboard: bdiako8tt1b7kc/capa-agregated-error-logs-for-capa-controllers
expr: irsa_operator_cluster_errors > 0
for: 10m
labels:
area: kaas
cancel_if_kube_state_metrics_down: "true"
cancel_if_outside_working_hours: "true"
severity: page
team: phoenix
topic: aws
{{- end }}
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# Shared IRSA (irsa-operator) alerting rules.
# One rule file serves both CAPA and vintage AWS management clusters so the
# alerts are not duplicated per provider flavor.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  labels:
    {{- include "labels.common" . | nindent 4 }}
    # The cluster_type label is only added when Mimir is disabled.
    {{- if not .Values.mimir.enabled }}
    cluster_type: "management_cluster"
    {{- end }}
  name: irsa.rules
  namespace: {{ .Values.namespace }}
spec:
  groups:
  - name: irsa-operator
    rules:
    # Pages when irsa-operator has reported cluster errors for 10 minutes.
    - alert: IRSATooManyErrors
      annotations:
        # The description is wrapped in a Helm raw string so that the
        # Prometheus label placeholders survive Helm rendering untouched.
        description: '{{`IRSA bootstrap for Cluster {{ $labels.cluster_id }} failed.`}}'
        opsrecipe: irsa-operator-error/
        ## The dashboard annotation is only rendered for the "capi" flavor.
        ## TODO Remove this flavor condition once all vintage installations are gone.
        {{- if eq .Values.managementCluster.provider.flavor "capi" }}
        dashboard: bdiako8tt1b7kc/capa-agregated-error-logs-for-capa-controllers
        {{- end }}
      expr: irsa_operator_cluster_errors{cluster_type="management_cluster"} > 0
      for: 10m
      labels:
        area: kaas
        cancel_if_kube_state_metrics_down: "true"
        cancel_if_cluster_status_creating: "true"
        cancel_if_cluster_status_deleting: "true"
        cancel_if_outside_working_hours: "true"
        severity: page
        team: phoenix
        topic: aws
    # Pages when the earliest expiry of an IRSA ACM certificate is less than
    # 5184000 seconds (60 days) away.
    - alert: IRSAACMCertificateExpiringInLessThan60Days
      annotations:
        description: '{{`IRSA ACM certificate for Cluster {{ $labels.cluster_id }} ({{ $labels.certificate_name }}) will expire in less than 2 months.`}}'
        opsrecipe: irsa-acm-certificate-expiring/
      expr: min(irsa_operator_acm_certificate_not_after{cluster_type="management_cluster"}) by (cluster_id, installation, pipeline, provider, certificate_name) - time() < 5184000
      for: 10m
      labels:
        area: kaas
        cancel_if_cluster_status_creating: "true"
        cancel_if_cluster_status_deleting: "true"
        cancel_if_outside_working_hours: "true"
        severity: page
        team: phoenix
        topic: aws
Original file line number Diff line number Diff line change
Expand Up @@ -162,45 +162,4 @@ spec:
severity: page
team: phoenix
topic: kubernetes
- alert: IRSATooManyErrors
annotations:
description: '{{`IRSA bootstrap for Cluster {{ $labels.cluster_id }} failed.`}}'
opsrecipe: irsa-operator-error/
expr: irsa_operator_cluster_errors > 0
for: 10m
labels:
area: kaas
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
cancel_if_outside_working_hours: "true"
severity: page
team: phoenix
topic: aws
- alert: IRSAACMCertificateExpiringInLessThan60Days
annotations:
description: '{{`IRSA ACM certificate for Cluster {{ $labels.cluster_id }} ({{ $labels.certificate_name }}) will expire in less than 2 months.`}}'
opsrecipe: irsa-acm-certificate-expiring/
expr: min(irsa_operator_acm_certificate_not_after) by (cluster_id, installation, pipeline, provider, certificate_name) - time() < 5184000
for: 10m
labels:
area: kaas
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
cancel_if_outside_working_hours: "true"
severity: page
team: phoenix
topic: aws
- name: aws-jobs
rules:
- alert: JobHasNotBeenScheduledForTooLong
annotations:
description: '{{`CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} has not been scheduled for more than 2 hours.`}}'
opsrecipe: job-has-not-been-scheduled-for-too-long/
expr: (time() - kube_cronjob_status_last_schedule_time{cronjob="route53-manager"}) > 7200
for: 15m
labels:
area: kaas
severity: page
team: phoenix
topic: managementcluster
{{- end }}
1 change: 1 addition & 0 deletions test/conf/promtool_ignore
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ kaas/phoenix/alerting-rules/aws.node.workload-cluster.rules.yml
kaas/phoenix/alerting-rules/aws.workload-cluster.rules.yml
kaas/phoenix/alerting-rules/capa.management-cluster.rules.yml
kaas/phoenix/alerting-rules/dns-operator-azure.rules.yml
kaas/phoenix/alerting-rules/irsa.rules.yml
kaas/phoenix/alerting-rules/vintage.aws.management-cluster.rules.yml
kaas/phoenix/alerting-rules/vintage.calico.rules.yml
kaas/phoenix/alerting-rules/vintage.cluster-service.rules.yml
Expand Down

0 comments on commit 8607e1c

Please sign in to comment.