review phoenix alerts #1211

Changes from all commits
----------------------------------------------------------------------
@@ -1,23 +1,25 @@
 {{- if eq .Values.managementCluster.provider.kind "aws" }}
+# This rule applies to vintage aws and capa workload clusters
 apiVersion: monitoring.coreos.com/v1
 kind: PrometheusRule
 metadata:
   creationTimestamp: null
   labels:
     {{- include "labels.common" . | nindent 4 }}
+    # No need for .Values.mimir.enabled condition - will be gone with Vintage
     {{- if not .Values.mimir.enabled }}
     cluster_type: "workload_cluster"
     {{- end }}
   name: aws.workload-cluster.rules
   namespace: {{ .Values.namespace }}
 spec:
   groups:
-  - name: aws
+  - name: aws.workload-cluster

[Review comment] Just making sure we're not replacing another rule group.

     rules:
     - alert: WorkloadClusterContainerIsRestartingTooFrequentlyAWS
       annotations:
         description: '{{`Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} is restarting too often.`}}'
         opsrecipe: container-is-restarting-too-often/
-      expr: label_join(increase(kube_pod_container_status_restarts_total{container=~"aws-node.*|kiam-agent.*|kiam-server.*|cluster-autoscaler.*|ebs-plugin.*|aws-pod-identity-webhook.*|etcd-kubernetes-resources-count-exporter.*"}[1h]),"service","/","namespace","pod") > 10
+      ## TODO Review this list once all vintage installations are gone
+      expr: label_join(increase(kube_pod_container_status_restarts_total{container=~"aws-node.*|kiam-agent.*|kiam-server.*|ebs-(plugin|csi).*|aws-pod-identity-webhook.*"}[1h]), "service", "/", "namespace", "pod") > 10

[Review comment] Moved cluster-autoscaler and etcd-kubernetes-resources-count-exporter.* to turtles.

       for: 10m
       labels:
         area: kaas
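The `label_join` call in this expression is easy to misread: it only copies labels, it never filters series. A minimal standalone sketch of the same pattern (the single `aws-node` container matcher here is illustrative, not the rule's full list):

```promql
# label_join(v, dst, sep, src1, src2, ...) writes "<namespace>/<pod>" into a
# new "service" label on every series returned by increase(); the > 10
# comparison then keeps only containers with more than 10 restarts per hour.
label_join(
  increase(kube_pod_container_status_restarts_total{container="aws-node"}[1h]),
  "service", "/", "namespace", "pod"
) > 10
```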
@@ -29,68 +31,42 @@ spec:
         severity: page
         team: phoenix
         topic: kubernetes
-    - alert: WorkloadClusterCriticalPodNotRunningAWS
+    - alert: WorkloadClusterPodPendingAWS
       annotations:
-        description: '{{`Critical pod {{ $labels.namespace }}/{{ $labels.pod }} is not running.`}}'
-        opsrecipe: critical-pod-is-not-running/
-      expr: kube_pod_container_status_running{namespace="kube-system",container=~"(k8s-api-server|k8s-controller-manager|k8s-scheduler)"} != 1 or label_replace(absent(kube_pod_container_status_running{namespace="kube-system",container="k8s-api-server"}), "pod", "$1", "container", "(.+)") == 1 or label_replace(absent(kube_pod_container_status_running{namespace="kube-system",container="k8s-controller-manager"}), "pod", "$1", "container", "(.+)") == 1 or label_replace(absent(kube_pod_container_status_running{namespace="kube-system",container="k8s-scheduler"}), "pod", "$1", "container", "(.+)") == 1
-      for: 20m
+        description: '{{`Pod {{ $labels.namespace }}/{{ $labels.pod }} is stuck in Pending.`}}'
+        opsrecipe: pod-stuck-in-pending/
+      ## TODO Review this list once all vintage installations are gone
+      expr: kube_pod_status_phase{namespace="kube-system",pod=~"(aws-node.*|kiam-agent.*|kiam-server.*|ebs-(plugin|csi).*)", phase="Pending"} == 1
+      for: 15m
       labels:
         area: kaas
         cancel_if_outside_working_hours: "true"
         cancel_if_cluster_status_creating: "true"
         cancel_if_cluster_status_deleting: "true"
         cancel_if_cluster_status_updating: "true"
         cancel_if_kube_state_metrics_down: "true"
         cancel_if_cluster_has_no_workers: "true"
         severity: page
         team: phoenix
         topic: kubernetes
-    - alert: WorkloadClusterControlPlaneNodeMissingAWS

[Review comment] Moved to turtles.

-      annotations:
-        description: '{{`Control plane node is missing.`}}'
-        opsrecipe: master-node-missing/
-      expr: count by (cluster_id) (kubernetes_build_info{app="kubelet"} unless on (node) kube_node_role{role!~"control-plane|master"}) == 0
-      for: 30m
-      labels:
-        area: kaas
-        cancel_if_cluster_status_creating: "true"
-        cancel_if_cluster_status_deleting: "true"
-        control_plane_node_down: "true"
-        severity: page
-        team: phoenix
-        topic: kubernetes
-    - alert: WorkloadClusterHAControlPlaneDownForTooLong
-      annotations:
-        description: '{{`Control plane node in HA setup is down for a long time.`}}'
-        opsrecipe: master-node-missing/
-      expr: sum by (cluster_id) (kubernetes_build_info{app="kubelet"} * on (node) kube_node_role{role="control-plane"}) == 2 or sum by (cluster_id) (kubernetes_build_info{app="kubelet"} * on (node) kube_node_role{role="master"}) == 2
-      for: 30m
-      labels:
-        area: kaas
-        cancel_if_cluster_status_creating: "true"
-        cancel_if_cluster_status_deleting: "true"
-        cancel_if_outside_working_hours: "true"
-        control_plane_node_down: "true"
-        severity: page
-        team: phoenix
-        topic: kubernetes
-    - alert: WorkloadClusterPodPendingAWS
+    {{- if eq .Values.managementCluster.provider.kind "aws" }}
+    ## TODO Remove when all vintage installations are gone
+    - alert: WorkloadClusterCriticalPodNotRunningAWS
       annotations:
-        description: '{{`Pod {{ $labels.namespace }}/{{ $labels.pod }} is stuck in Pending.`}}'
-        opsrecipe: pod-stuck-in-pending/
-      expr: kube_pod_status_phase{namespace="kube-system",pod=~"(aws-node.*|kiam-agent.*|kiam-server.*|cluster-autoscaler.*|ebs-csi-.*)",phase="Pending"} == 1
-      for: 15m
+        description: '{{`Critical pod {{ $labels.namespace }}/{{ $labels.pod }} is not running.`}}'
+        opsrecipe: critical-pod-is-not-running/
+      expr: kube_pod_container_status_running{namespace="kube-system", container=~"(k8s-api-server|k8s-controller-manager|k8s-scheduler)"} != 1 or label_replace(absent(kube_pod_container_status_running{namespace="kube-system", container="k8s-api-server"}), "pod", "$1", "container", "(.+)") == 1 or label_replace(absent(kube_pod_container_status_running{namespace="kube-system", container="k8s-controller-manager"}), "pod", "$1", "container", "(.+)") == 1 or label_replace(absent(kube_pod_container_status_running{namespace="kube-system", container="k8s-scheduler"}), "pod", "$1", "container", "(.+)") == 1
+      for: 20m
       labels:
         area: kaas
         cancel_if_outside_working_hours: "true"
         cancel_if_cluster_status_creating: "true"
         cancel_if_cluster_status_deleting: "true"
         cancel_if_cluster_status_updating: "true"
         cancel_if_kube_state_metrics_down: "true"
         cancel_if_cluster_has_no_workers: "true"
         severity: page
         team: phoenix
         topic: kubernetes
     - alert: WorkloadClusterAWSCNIIpAlmostExhausted
       annotations:
-        description: '{{`IPs exhausted for aws-cni subnet {{ $labels.id }} in AZ {{ $labels.availabvility_zone }}.`}}'
+        description: '{{`IPs exhausted for aws-cni subnet {{ $labels.id }} in AZ {{ $labels.availability_zone }}.`}}'
         opsrecipe: aws-ips-exhausted/
       expr: min(aws_operator_subnet_available_ips_percentage{subnet_type="aws-cni"}) by (account, availability_zone, cluster_id, id) < 0.1
       for: 5m
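The restored `WorkloadClusterCriticalPodNotRunningAWS` expression relies on a pattern worth unpacking: `absent()` returns a single series with value 1 when its selector matches nothing, carrying only the labels given as equality matchers, and `label_replace` then copies the container name into a synthetic `pod` label so the description can still render `{{ $labels.pod }}`. A minimal sketch of one branch (scheduler only):

```promql
# absent() yields {namespace="kube-system", container="k8s-scheduler"} => 1
# when the series is missing entirely; label_replace(v, dst, repl, src, regex)
# then copies the matched container name into a "pod" label.
label_replace(
  absent(kube_pod_container_status_running{namespace="kube-system", container="k8s-scheduler"}),
  "pod", "$1", "container", "(.+)"
) == 1
```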
@@ -111,4 +87,4 @@ spec:
         severity: page
         team: phoenix
         topic: workloadcluster
-{{- end }}
+{{- end }}
----------------------------------------------------------------------
@@ -1,4 +1,5 @@
 {{- if eq .Values.managementCluster.provider.kind "capa" }}
+# This rule applies to capa management clusters only
 apiVersion: monitoring.coreos.com/v1
 kind: PrometheusRule
 metadata:
@@ -12,13 +13,13 @@ metadata:
   namespace: {{ .Values.namespace }}
 spec:
   groups:
-  - name: capa
+  - name: capa.management-cluster
     rules:
     - alert: ManagementClusterPodPendingCAPA
       annotations:
         description: '{{`Pod {{ $labels.namespace }}/{{ $labels.pod }} is stuck in Pending.`}}'
         opsrecipe: pod-stuck-in-pending/
-      expr: kube_pod_status_phase{namespace="giantswarm", pod=~"(aws.*|capa.*|irsa-operator.*)",phase="Pending", cluster_type="management_cluster"} == 1
+      expr: kube_pod_status_phase{namespace="giantswarm", provider="capa", pod=~"(aws.*|capa.*|irsa-operator.*)", phase="Pending", cluster_type="management_cluster"} == 1
       for: 15m
       labels:
         area: kaas
@@ -48,11 +49,11 @@ spec:
         description: '{{`Deployment {{ $labels.deployment }} is missing.`}}'
         opsrecipe: management-cluster-deployment-is-missing/
       expr: |
-        absent(kube_deployment_status_condition{namespace="giantswarm", condition="Available", deployment="aws-resolver-rules-operator", cluster_type="management_cluster"})
-        or absent(kube_deployment_status_condition{namespace="giantswarm", condition="Available", deployment="aws-vpc-operator", cluster_type="management_cluster"})
-        or absent(kube_deployment_status_condition{namespace="giantswarm", condition="Available", deployment="capa-controller-manager", cluster_type="management_cluster"})
-        or absent(kube_deployment_status_condition{namespace="giantswarm", condition="Available", deployment="capa-iam-operator", cluster_type="management_cluster"})
-        or absent(kube_deployment_status_condition{namespace="giantswarm", condition="Available", deployment="irsa-operator", cluster_type="management_cluster"})
+        absent(kube_deployment_status_condition{namespace="giantswarm", condition="Available", deployment="aws-resolver-rules-operator", cluster_id="{{ .Values.managementCluster.name }}", installation="{{ .Values.managementCluster.name }}", provider="{{ .Values.managementCluster.provider.kind }}", pipeline="{{ .Values.managementCluster.pipeline }}"})

[Review comment] Those labels ensure we can use absent safely with Mimir.

+        or absent(kube_deployment_status_condition{namespace="giantswarm", condition="Available", deployment="aws-vpc-operator", cluster_id="{{ .Values.managementCluster.name }}", installation="{{ .Values.managementCluster.name }}", provider="{{ .Values.managementCluster.provider.kind }}", pipeline="{{ .Values.managementCluster.pipeline }}"})
+        or absent(kube_deployment_status_condition{namespace="giantswarm", condition="Available", deployment="capa-controller-manager", cluster_id="{{ .Values.managementCluster.name }}", installation="{{ .Values.managementCluster.name }}", provider="{{ .Values.managementCluster.provider.kind }}", pipeline="{{ .Values.managementCluster.pipeline }}"})
+        or absent(kube_deployment_status_condition{namespace="giantswarm", condition="Available", deployment="capa-iam-operator", cluster_id="{{ .Values.managementCluster.name }}", installation="{{ .Values.managementCluster.name }}", provider="{{ .Values.managementCluster.provider.kind }}", pipeline="{{ .Values.managementCluster.pipeline }}"})
+        or absent(kube_deployment_status_condition{namespace="giantswarm", condition="Available", deployment="irsa-operator", cluster_id="{{ .Values.managementCluster.name }}", installation="{{ .Values.managementCluster.name }}", provider="{{ .Values.managementCluster.provider.kind }}", pipeline="{{ .Values.managementCluster.pipeline }}"})
       for: 15m
       labels:
         area: kaas
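The reviewer's point about `absent()` and Mimir deserves a concrete illustration: `absent()` copies only the equality matchers of its argument into its result labels, so without the templated `cluster_id`/`installation`/`provider`/`pipeline` matchers the fired alert would carry no routing labels and, in a multi-tenant Mimir ruler, could not be attributed to an installation. A sketch of how one branch might render (the cluster name "example" and pipeline "stable" are made-up values):

```promql
# Rendered form (hypothetical values): because every matcher below is an
# equality matcher, absent() returns a series that still carries cluster_id,
# installation, provider and pipeline when the deployment metric is missing.
absent(kube_deployment_status_condition{namespace="giantswarm",
  condition="Available", deployment="capa-controller-manager",
  cluster_id="example", installation="example",
  provider="capa", pipeline="stable"})
```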
----------------------------------------------------------------------
@@ -1,4 +1,3 @@
-{{- if (eq .Values.managementCluster.provider.kind "capz") }}
 apiVersion: monitoring.coreos.com/v1
 kind: PrometheusRule
 metadata:
@@ -23,20 +22,19 @@
         area: kaas
         cancel_if_outside_working_hours: {{include "workingHoursOnly" .}}
         severity: notify
-        team: {{include "providerTeam" .}}

[Review comment] @giantswarm/team-phoenix this operator can only run on a capz MC, right?

+        team: phoenix
         topic: managementcluster
     - alert: AzureDNSOperatorAPIErrorRate
       annotations:
         description: |-
           {{`Error rate for {{ $labels.method }} is high. Check dns-operator-azure logs in installation/{{ $labels.installation }}.`}}
         opsrecipe: dns-operator-azure/
       expr: |-
-        sum by (cluster_id, method, installation) (rate(dns_operator_azure_api_request_errors_total[5m])) > 0
+        sum by (cluster_id, installation, method, pipeline, provider) (rate(dns_operator_azure_api_request_errors_total[5m])) > 0
       for: 15m
       labels:
         area: kaas
         cancel_if_outside_working_hours: {{include "workingHoursOnly" .}}
         severity: notify
-        team: {{include "providerTeam" .}}
+        team: phoenix
         topic: managementcluster
-{{- end }}
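The widened `by` clause in the error-rate expression is doing the same label-preservation job as the `absent()` matchers above: `sum by (...)` drops every label it does not list, so `pipeline` and `provider` have to appear in the grouping for the alert to keep them. An illustrative standalone form of the query:

```promql
# rate() turns the error counter into errors/second over 5m windows; sum by
# keeps exactly the listed labels, one series per installation and API method,
# and the > 0 filter fires on any sustained error rate.
sum by (cluster_id, installation, method, pipeline, provider) (
  rate(dns_operator_azure_api_request_errors_total[5m])
) > 0
```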
----------------------------------------------------------------------
@@ -0,0 +1,48 @@
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  creationTimestamp: null
+  labels:
+    {{- include "labels.common" . | nindent 4 }}
+    {{- if not .Values.mimir.enabled }}
+    cluster_type: "workload_cluster"
+    {{- end }}
+  name: pods.core.rules
+  namespace: {{ .Values.namespace }}
+spec:
+  groups:
+  - name: pods.core
+    rules:
+    - alert: ContainerIsRestartingTooFrequently
+      annotations:
+        description: '{{`Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} is restarting too often.`}}'
+        opsrecipe: container-is-restarting-too-often/
+      expr: label_join(increase(kube_pod_container_status_restarts_total{container=~"cluster-autoscaler.*|etcd-kubernetes-resources-count-exporter.*"}[1h]), "service", "/", "namespace", "pod") > 10
+      for: 10m
+      labels:
+        area: kaas
+        cancel_if_cluster_status_creating: "true"
+        cancel_if_cluster_status_deleting: "true"
+        cancel_if_cluster_status_updating: "true"
+        cancel_if_outside_working_hours: "true"
+        cancel_if_cluster_has_no_workers: "true"
+        severity: page
+        team: {{ include "providerTeam" . }}
+        topic: kubernetes
+    - alert: PodPending
+      annotations:
+        description: '{{`Pod {{ $labels.namespace }}/{{ $labels.pod }} is stuck in Pending.`}}'
+        opsrecipe: pod-stuck-in-pending/
+      expr: kube_pod_status_phase{namespace="kube-system",pod=~"(cluster-autoscaler.*)",phase="Pending"} == 1
+      for: 15m
+      labels:
+        area: kaas
+        cancel_if_outside_working_hours: "true"
+        cancel_if_cluster_status_creating: "true"
+        cancel_if_cluster_status_deleting: "true"
+        cancel_if_cluster_status_updating: "true"
+        cancel_if_kube_state_metrics_down: "true"
+        cancel_if_cluster_has_no_workers: "true"
+        severity: page
+        team: {{ include "providerTeam" . }}
+        topic: kubernetes
[Review comment] Impaired nodes can happen on capa/eks.
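Since the routing of these shared rules depends on how the chart renders, rules like `PodPending` can be checked mechanically before merging with `promtool test rules`. A hedged sketch of such a unit test follows; the rendered file name, the `team: phoenix` value (whatever `providerTeam` resolves to), and the series values are assumptions, and `exp_labels` must list every label the fired alert actually carries:

```yaml
# Hypothetical promtool unit test (run as: promtool test rules pods-test.yml),
# assuming the chart was rendered to pods.core.rules.yml.
rule_files:
  - pods.core.rules.yml
evaluation_interval: 1m
tests:
  - interval: 1m
    input_series:
      # Pod stays Pending for the whole test window.
      - series: 'kube_pod_status_phase{namespace="kube-system", pod="cluster-autoscaler-0", phase="Pending"}'
        values: '1x20'
    alert_rule_test:
      - eval_time: 16m   # past the 15m "for" duration, so the alert is firing
        alertname: PodPending
        exp_alerts:
          - exp_labels:
              namespace: kube-system
              pod: cluster-autoscaler-0
              phase: Pending
              area: kaas
              cancel_if_outside_working_hours: "true"
              cancel_if_cluster_status_creating: "true"
              cancel_if_cluster_status_deleting: "true"
              cancel_if_cluster_status_updating: "true"
              cancel_if_kube_state_metrics_down: "true"
              cancel_if_cluster_has_no_workers: "true"
              severity: page
              team: phoenix
              topic: kubernetes
```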