diff --git a/CHANGELOG.md b/CHANGELOG.md index a055bedaf..9d38c68f8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,12 +10,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added - Add new mimir.enabled property to disable the MC/WC split in alerts. +- Add new alert for reconcile errors in the `AWS load balancer controller`. ### Changed - Change ownership of `CadvisorDown` to Turtles/Phoenix. - Review alerting prior to Mimir migration. - Increase duration for fluentbit rules to avoid false alerts when a new release is deployed. +- Improve the query of the `AWS load balancer controller` alert for failed AWS API calls. ### Removed diff --git a/helm/prometheus-rules/templates/alerting-rules/aws-load-balancer-controller.rules.yml b/helm/prometheus-rules/templates/alerting-rules/aws-load-balancer-controller.rules.yml index 336c74c6a..8774803f1 100644 --- a/helm/prometheus-rules/templates/alerting-rules/aws-load-balancer-controller.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/aws-load-balancer-controller.rules.yml @@ -14,11 +14,11 @@ spec: groups: - name: aws-load-balancer-controller rules: - - alert: AWSLoadBalancerAssumeRoleErrors + - alert: AWSLoadBalancerControllerAWSAPIErrors annotations: - description: '{{`AWS load balancer pod {{ $labels.namespace}}/{{ $labels.pod_name }} on {{ $labels.cluster_id}}/{{ $labels.cluster }} can not assume the role.`}}' - opsrecipe: alb-role-errors#assume-role-errors - expr: increase(aws_api_calls_total{error_code="WebIdentityErr"}[20m]) > 0 + description: '{{`AWS load balancer controller pod {{ $labels.namespace}}/{{ $labels.pod }} on {{ $labels.cluster_id}} is throwing {{ $labels.error_code }} errors when contacting AWS API.`}}' + opsrecipe: alb-errors + expr: sum(increase(aws_api_calls_total{error_code != ""}[20m])) by (error_code,namespace,pod,cluster_id) > 0 for: 40m labels: area: managedservices @@ -29,11 +29,11 @@ spec: severity: page team: phoenix topic: alb - - alert: 
AWSLoadBalancerRolePolicyErrors + - alert: AWSLoadBalancerControllerReconcileErrors annotations: - description: '{{`AWS load balancer pod {{ $labels.namespace}}/{{ $labels.pod_name }} on {{ $labels.cluster_id}}/{{ $labels.cluster }} has a wrong role policy.`}}' - opsrecipe: alb-role-errors#role-policy-errors - expr: increase(aws_api_calls_total{error_code="UnauthorizedOperation"}[20m]) > 0 + description: '{{`AWS load balancer controller pod {{ $labels.namespace }}/{{ $labels.pod }} on {{ $labels.cluster_id }} is throwing errors while reconciling the {{ $labels.controller }} controller.`}}' + opsrecipe: alb-errors + expr: sum(increase(controller_runtime_reconcile_total{result = "error"}[20m])) by (controller,namespace,pod,cluster_id) > 0 for: 40m labels: area: managedservices