Prepare alerts for mimir migration (#1060)
* Prepare alerts for mimir migration

* Reduce grouping to cluster-id only

* Update helm/prometheus-rules/templates/alerting-rules/dex.rules.yml

Co-authored-by: Hervé Nicol <[email protected]>

* Update helm/prometheus-rules/templates/alerting-rules/dns-operator-azure.rules.yml

Co-authored-by: Hervé Nicol <[email protected]>

* add missing cluster_ids

* add missing cluster_ids again

* add missing cluster_ids again

* Fix KSM and prometheus-agent alerts

* Fix failing tests

---------

Co-authored-by: Hervé Nicol <[email protected]>
QuentinBisson and hervenicol authored Mar 11, 2024
1 parent a81bd4c commit 4da1fd3
Showing 37 changed files with 369 additions and 93 deletions.
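Most of the hunks below follow one pattern: every aggregation (sum by, count by, avg by, min by) and every vector-matching clause (on, group_left) gains a cluster_id label, so that once all clusters ship their metrics into a shared Mimir backend, each cluster keeps alerting on its own series instead of being collapsed into a single cross-cluster result. A few hunks additionally adjust Helm conditionals around provider-specific alerts. A minimal before/after sketch of the main pattern, based on the first apiserver rule below with its label selectors trimmed for brevity:

# Before: one latency series per verb, aggregated across every cluster
histogram_quantile(0.95, sum(rate(apiserver_request_duration_seconds_bucket[1h])) by (verb, le)) > 1

# After: one series per (cluster_id, verb), so each cluster is evaluated on its own
histogram_quantile(0.95, sum(rate(apiserver_request_duration_seconds_bucket[1h])) by (cluster_id, verb, le)) > 1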

@@ -20,7 +20,7 @@ spec:
annotations:
description: '{{`Kubernetes API Server {{ $labels.verb }} request latency is too high.`}}'
opsrecipe: apiserver-overloaded/
-expr: histogram_quantile(0.95, sum(rate(apiserver_request_duration_seconds_bucket{cluster_type="management_cluster", verb=~"CONNECT|DELETE|GET|PATCH|POST|PUT"}[1h])) by (verb, le)) > 1
+expr: histogram_quantile(0.95, sum(rate(apiserver_request_duration_seconds_bucket{cluster_type="management_cluster", verb=~"CONNECT|DELETE|GET|PATCH|POST|PUT"}[1h])) by (cluster_id, verb, le)) > 1
for: 1h
labels:
area: kaas

@@ -19,7 +19,7 @@ spec:
annotations:
description: '{{`Kubernetes API Server {{ $labels.verb }} request latency is too high.`}}'
opsrecipe: apiserver-overloaded/
-expr: histogram_quantile(0.95, sum(rate(apiserver_request_duration_seconds_bucket{verb=~"CONNECT|DELETE|GET|PATCH|POST|PUT"}[1h])) by (verb, le)) > 1
+expr: histogram_quantile(0.95, sum(rate(apiserver_request_duration_seconds_bucket{cluster_type="workload_cluster", verb=~"CONNECT|DELETE|GET|PATCH|POST|PUT"}[1h])) by (cluster_id, verb, le)) > 1
for: 1h
labels:
area: kaas
@@ -35,7 +35,7 @@ spec:
annotations:
description: '{{`Kubernetes API Server {{ $labels.cluster_id }} having admission webhook errors.`}}'
opsrecipe: apiserver-admission-webhook-errors/
-expr: rate(apiserver_admission_webhook_rejection_count{error_type=~"calling_webhook_error|apiserver_internal_error"}[5m]) > 0
+expr: rate(apiserver_admission_webhook_rejection_count{cluster_type="workload_cluster", error_type=~"calling_webhook_error|apiserver_internal_error"}[5m]) > 0
for: 5m
labels:
area: kaas

@@ -1,3 +1,4 @@
+{{- if eq .Values.managementCluster.provider.flavor "vintage" }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
@@ -37,3 +38,4 @@ spec:
severity: notify
team: {{ include "providerTeam" . }}
topic: kubernetes
+{{- end }}

@@ -17,7 +17,7 @@ spec:
If memory usage value is equal to memory limit value then it is likely the pod will be evicted.
If no limits are set then the pod will burst.
`}}
-expr: sum by (pod, namespace, container) (container_memory_working_set_bytes{container=~"(cert-manager)"}) / sum by (pod, namespace, container) (kube_pod_container_resource_requests_memory_bytes{container=~"(cert-manager)"}) >= 0.85
+expr: sum by (cluster_id, pod, namespace, container) (container_memory_working_set_bytes{container=~"(cert-manager|cert-manager-app-controller)"}) / sum by (cluster_id, pod, namespace, container) (kube_pod_container_resource_requests{resource="memory", unit="byte",container=~"(cert-manager|cert-manager-app-controller)"}) >= 0.85
for: 10m
labels:
area: kaas

@@ -10,6 +10,7 @@ spec:
groups:
- name: certificate.all
rules:
+{{- if eq .Values.managementCluster.provider.flavor "vintage" }}
- alert: KiamCertificateSecretWillExpireInLessThanTwoWeeks
annotations:
description: '{{`Kiam Certificate stored in Secret {{ $labels.namespace }}/{{ $labels.name }} on {{ $labels.cluster_id }} will expire in less than two weeks.`}}'
@@ -22,6 +23,7 @@ spec:
severity: page
team: phoenix
topic: cert-manager
+{{- end }}
- alert: IRSACertificateSecretWillExpireInLessThanTwoWeeks
annotations:
description: '{{`IRSA Pod Identity Webhook Certificate stored in Secret {{ $labels.namespace }}/{{ $labels.name }} on {{ $labels.cluster_id }} will expire in less than two weeks.`}}'

@@ -17,7 +17,7 @@ spec:
annotations:
description: '{{`Cannot renew Certificate for Secret {{ $labels.namespace }}/{{ $labels.certificatename }} because it is missing.`}}'
opsrecipe: managed-app-cert-manager/missing-certificate-for-secret/
-expr: count(cert_exporter_secret_not_after{cluster_type="management_cluster", secretkey="tls.crt", certificatename!=""}) by (certificatename,namespace) unless count(label_replace(cert_exporter_certificate_cr_not_after{cluster_type="management_cluster"}, "certificatename", "$1", "name", "(.*)")) by (certificatename,namespace)
+expr: count(cert_exporter_secret_not_after{cluster_type="management_cluster", secretkey="tls.crt", certificatename!=""}) by (cluster_id, certificatename, namespace) unless count(label_replace(cert_exporter_certificate_cr_not_after{cluster_type="management_cluster"}, "certificatename", "$1", "name", "(.*)")) by (cluster_id, certificatename,namespace)
for: 5m
labels:
area: kaas

@@ -14,7 +14,7 @@ spec:
annotations:
description: '{{`Cilium BPF map is about to fill up.`}}'
opsrecipe: cilium-bpf-map/
-expr: avg(cilium_bpf_map_pressure) by (map_name) * 100 > 80
+expr: avg(cilium_bpf_map_pressure) by (cluster_id, map_name) * 100 > 80
for: 15m
labels:
area: kaas
@@ -26,7 +26,7 @@ spec:
annotations:
description: '{{`Cilium BPF map is about filled up.`}}'
opsrecipe: cilium-bpf-map/
-expr: avg(cilium_bpf_map_pressure) by (map_name) * 100 > 95
+expr: avg(cilium_bpf_map_pressure) by (cluster_id, map_name) * 100 > 95
for: 15m
labels:
area: kaas

17 changes: 15 additions & 2 deletions helm/prometheus-rules/templates/alerting-rules/coredns.rules.yml
@@ -14,7 +14,8 @@ spec:
annotations:
description: '{{`CoreDNS Deployment {{ $labels.namespace}}/{{ $labels.deployment }} is not satisfied.`}}'
opsrecipe: core-dns-deployment-not-satisfied/
-expr: sum(kube_deployment_status_replicas_available{deployment=~"coredns.*"}) / (sum(kube_deployment_status_replicas_available{deployment=~"coredns.*"}) + sum(kube_deployment_status_replicas_unavailable{deployment=~"coredns.*"}))* 100 < 51
+expr: |
+sum(kube_deployment_status_replicas_available{deployment=~"coredns.*"}) by (cluster_id) / (sum(kube_deployment_status_replicas_available{deployment=~"coredns.*"}) by (cluster_id) + sum(kube_deployment_status_replicas_unavailable{deployment=~"coredns.*"}) by (cluster_id))* 100 < 51
for: 10m
labels:
area: empowerment
@@ -25,7 +26,19 @@ spec:
team: cabbage
topic: dns
- alert: CoreDNSMaxHPAReplicasReached
-expr: kube_hpa_status_current_replicas{hpa="coredns"} == kube_hpa_spec_max_replicas{hpa="coredns"} AND kube_hpa_spec_min_replicas{hpa="coredns"} != kube_hpa_spec_max_replicas{hpa="coredns"}
+expr: |
+(
+# This is using the deprecated HPA metric names https://github.com/kubernetes/kube-state-metrics/commit/eb01334f2d03ebc3ab25cd7b29d0ff28f6ca5ee0
+# TODO(@team-cabbage) remove once kube-state-metrics is updated to use the new metric names everywhere
+kube_hpa_status_current_replicas{hpa="coredns"} == kube_hpa_spec_max_replicas{hpa="coredns"}
+and
+kube_hpa_spec_min_replicas{hpa="coredns"} != kube_hpa_spec_max_replicas{hpa="coredns"}
+) or (
+# This is using the new HPA metric names
+kube_horizontalpodautoscaler_status_current_replicas{horizontalpodautoscaler="coredns"} == kube_horizontalpodautoscaler_spec_max_replicas{horizontalpodautoscaler="coredns"}
+and
+kube_horizontalpodautoscaler_spec_min_replicas{horizontalpodautoscaler="coredns"} != kube_horizontalpodautoscaler_spec_max_replicas{horizontalpodautoscaler="coredns"}
+)
for: 120m
labels:
area: empowerment
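A note on the HPA expression above: kube-state-metrics v2 renamed the kube_hpa_* series to kube_horizontalpodautoscaler_*, which is why the rule matches both families until every cluster runs the newer version. One way to check which family a given cluster still exposes is a query along these lines (illustrative only, not part of this change):

# Lists, per cluster, which of the two HPA spec metrics is currently scraped
count by (cluster_id, __name__) ({__name__=~"kube_hpa_spec_max_replicas|kube_horizontalpodautoscaler_spec_max_replicas"})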

@@ -32,7 +32,7 @@ spec:
annotations:
description: '{{`Too many tags are not synchronised to registry mirrors.`}}'
opsrecipe: crsync-too-many-tags-missing/
-expr: crsync_sync_tags_total{registry="quay.io"} - on (repository,app) group_left sum by(repository,app) (crsync_sync_tags_total{registry!="quay.io"}) > 0
+expr: crsync_sync_tags_total{registry="quay.io"} - on (cluster_id, repository, app) group_left sum by(cluster_id, repository, app) (crsync_sync_tags_total{registry!="quay.io"}) > 0
for: 1h
labels:
area: kaas

@@ -17,22 +17,7 @@ spec:
annotations:
description: '{{`Deployment {{ $labels.namespace}}/{{ $labels.deployment }} is not satisfied.`}}'
opsrecipe: deployment-not-satisfied/
-expr: kube_deployment_status_replicas_unavailable{cluster_type="management_cluster", deployment=~"alertmanager.*|grafana.*|prometheus.*|promxy.*", cluster_id!~"argali|giraffe"} > 0
-for: 30m
-labels:
-area: kaas
-cancel_if_cluster_status_creating: "true"
-cancel_if_cluster_status_deleting: "true"
-cancel_if_cluster_status_updating: "true"
-cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }}
-severity: page
-team: atlas
-topic: managementcluster
-- alert: DeploymentNotSatisfiedChinaAtlas
-annotations:
-description: '{{`Deployment {{ $labels.namespace}}/{{ $labels.deployment }} is not satisfied.`}}'
-opsrecipe: deployment-not-satisfied/
-expr: kube_deployment_status_replicas_unavailable{cluster_type="management_cluster", deployment=~"alertmanager.*|grafana.*|prometheus.*|promxy.*", cluster_id=~"argali|giraffe"} > 0
+expr: kube_deployment_status_replicas_unavailable{cluster_type="management_cluster", deployment=~"alertmanager.*|grafana.*|prometheus.*|promxy.*"} > 0
for: 30m
labels:
area: kaas

@@ -29,7 +29,7 @@ spec:
annotations:
description: '{{`dex-operator failed to renew secret of {{ $labels.app_registration_name }} for {{ $labels.app_owner }} on provider {{ $labels.provider_type }}.`}}'
opsrecipe: dex-operator/
-expr: min by(app_registration_name, app_owner, app_namespace, provider_name, provider_type, installation) (aggregation:dex_operator_idp_secret_expiry_time{cluster_type="management_cluster"}) - time() < 60*60*12
+expr: min by(app_registration_name, app_owner, app_namespace, provider_name, provider_type, installation, cluster_id) (aggregation:dex_operator_idp_secret_expiry_time{cluster_type="management_cluster"}) - time() < 60*60*12
for: 30m
labels:
area: kaas

@@ -16,7 +16,7 @@ spec:
opsrecipe: dns-operator-azure/
expr: |-
capi_cluster_status_phase{phase="Provisioned"}
-unless on (name)
+unless on (cluster_id, name)
label_replace(dns_operator_azure_zone_info{type="public"}, "name", "$1", "resource_group", "(.+)")
for: 30m
labels:
@@ -31,7 +31,7 @@ spec:
{{`Error rate for {{ $labels.method }} is high. Check dns-operator-azure logs in installation/{{ $labels.installation }}.`}}
opsrecipe: dns-operator-azure/
expr: |-
-sum by (method,installation) (rate(dns_operator_azure_api_request_errors_total[5m])) > 0
+sum by (cluster_id, method, installation) (rate(dns_operator_azure_api_request_errors_total[5m])) > 0
for: 15m
labels:
area: kaas
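The same concern applies to binary operations such as the unless expression above: on (...) compares only the listed labels, so without cluster_id a matching series from any other cluster could suppress the alert once all clusters share one datastore. A toy illustration with made-up metrics left_series and right_series (purely hypothetical, for intuition only):

# Keeps the left-hand series that have no right-hand match with the same
# cluster_id and name; dropping cluster_id from on (...) would let a match
# in a different cluster silence the alert for this one.
left_series unless on (cluster_id, name) right_series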

@@ -25,7 +25,7 @@ spec:
annotations:
description: '{{`Cluster {{ $labels.installation }}/{{ $labels.cluster_id }}: there are too many API requests for flow schema {{ $labels.flow_schema }}.`}}'
opsrecipe: flowcontrol-rejected-requests/
-expr: sum(irate(apiserver_flowcontrol_dispatched_requests_total[1m])) by (priority_level) > (min by(priority_level) (apiserver_flowcontrol_request_concurrency_limit_overwrite) OR min by(priority_level) (apiserver_flowcontrol_request_concurrency_limit))
+expr: sum(irate(apiserver_flowcontrol_dispatched_requests_total[1m])) by (cluster_id, priority_level) > (min by(cluster_id, priority_level) (apiserver_flowcontrol_request_concurrency_limit_overwrite) OR min by(cluster_id, priority_level) (apiserver_flowcontrol_request_concurrency_limit))
for: 15m
labels:
area: kaas

@@ -46,7 +46,7 @@ spec:
annotations:
description: '{{`Fluentbit is down on node ({{ $labels.node }}).`}}'
opsrecipe: fluentbit-down/
-expr: sum(up{app="fluent-logshipping-app"}) by (app, cluster_id, cluster_type, installation, job, namespace, provider, node) == 0
+expr: sum(up{app="fluent-logshipping-app"}) by (app, cluster_id, job, namespace, node) == 0
for: 15m
labels:
area: empowerment

@@ -107,7 +107,7 @@ spec:
cancel_if_outside_working_hours: "true"
team: honeybadger
topic: releng
-# this alert checks average reconciliation times in 10 min windows, then calculates monthly error budget usage for it
+# this alert checks average reconciliation times in 10 min windows, then calculates monthly error budget usage for it
- alert: FluxReconciliationLongErrorBudgetLow
annotations:
description: |-

@@ -52,16 +52,12 @@ spec:
- alert: InhibitionClusterWithoutWorkerNodes
annotations:
description: '{{`Cluster ({{ $labels.cluster_id }}) has no worker nodes.`}}'
-{{- if eq .Values.managementCluster.provider.kind "aws" }}
expr: sum(aws_operator_asg_desired_count) by (cluster_id) - on(cluster_id) sum(aws_operator_asg_desired_count{asg=~".*-tccpn-.*"}) by (cluster_id) == 0
-{{- end }}
labels:
area: kaas
has_worker_nodes: "false"
team: phoenix
topic: status
-{{- end }}
-{{- if eq .Values.managementCluster.provider.kind "aws" }}
- alert: InhibitionKiamErrors
annotations:
description: '{{`Kiam on cluster {{ $labels.cluster_id }} has increased error rate.`}}'

@@ -59,7 +59,6 @@ spec:
cluster_status_deleting: "true"
team: phoenix
topic: status
-{{- if eq .Values.managementCluster.provider.kind "aws" }}
- alert: InhibitionClusterWithNoNodePools
annotations:
description: '{{`Cluster {{ $labels.cluster_id }} doesn''t have any node pools.`}}'
@@ -96,5 +95,4 @@ spec:
instance_state_not_running: "true"
team: phoenix
topic: status
-{{- end }}
{{- end }}