From 4da1fd343f5e2c77f9ba6b91b57233634103320b Mon Sep 17 00:00:00 2001 From: Quentin Bisson Date: Mon, 11 Mar 2024 15:47:46 +0100 Subject: [PATCH] Prepare alerts for mimir migration (#1060) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Prepare alerts for mimir migration * Reduce grouping to cluster-id only * Update helm/prometheus-rules/templates/alerting-rules/dex.rules.yml Co-authored-by: Hervé Nicol * Update helm/prometheus-rules/templates/alerting-rules/dns-operator-azure.rules.yml Co-authored-by: Hervé Nicol * add missing cluster_ids * add missing cluster_ids again * add missing cluster_ids again * Fix KSM and prometheus-agent alerts * Fix failing tests --------- Co-authored-by: Hervé Nicol --- .../apiserver.management-cluster.rules.yml | 2 +- .../apiserver.workload-cluster.rules.yml | 4 +- .../templates/alerting-rules/calico.rules.yml | 2 + .../alerting-rules/cert-manager.rules.yml | 2 +- .../alerting-rules/certificate.all.rules.yml | 2 + .../certificate.management-cluster.rules.yml | 2 +- .../templates/alerting-rules/cilium.rules.yml | 4 +- .../alerting-rules/coredns.rules.yml | 17 +- .../templates/alerting-rules/crsync.rules.yml | 2 +- .../deployment.management-cluster.rules.yml | 17 +- .../templates/alerting-rules/dex.rules.yml | 2 +- .../dns-operator-azure.rules.yml | 4 +- .../alerting-rules/fairness.rules.yml | 2 +- .../alerting-rules/fluentbit.rules.yml | 2 +- .../templates/alerting-rules/flux.rules.yml | 2 +- .../alerting-rules/inhibit.all.rules.yml | 4 - .../inhibit.management-cluster.rules.yml | 2 - .../kube-state-metrics.rules.yml | 153 +++++++++++++++++- .../alerting-rules/kyverno.all.rules.yml | 6 +- .../alerting-rules/loki.all.rules.yml | 6 +- .../management-cluster.rules.yml | 4 +- .../templates/alerting-rules/mimir.rules.yml | 2 +- .../node.management_cluster.rules.yml | 4 +- .../node.workload_cluster.rules.yml | 14 +- .../alerting-rules/operatorkit.rules.yml | 4 +- .../alerting-rules/prometheus-agent.rules.yml | 62 ++++--- .../prometheus-operator.rules.yml | 4 +- .../alerting-rules/promtail.rules.yml | 4 +- .../alerting-rules/service-level.rules.yml | 8 +- .../alerting-rules/silence-operator.rules.yml | 2 +- .../templates/alerting-rules/sloth.rules.yml | 2 +- .../global/kyverno.all.rules.test.yml | 9 +- .../providers/global/loki.all.rules.test.yml | 3 +- .../providers/global/mimir.rules.test.yml | 1 + .../global/prometheus-agent.rules.test.yml | 96 +++++++++++ .../providers/global/promtail.rules.test.yml | 5 + .../providers/global/sloth.rules.test.yml | 1 + 37 files changed, 369 insertions(+), 93 deletions(-) diff --git a/helm/prometheus-rules/templates/alerting-rules/apiserver.management-cluster.rules.yml b/helm/prometheus-rules/templates/alerting-rules/apiserver.management-cluster.rules.yml index 8951cc721..f60a3f8d7 100644 --- a/helm/prometheus-rules/templates/alerting-rules/apiserver.management-cluster.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/apiserver.management-cluster.rules.yml @@ -20,7 +20,7 @@ spec: annotations: description: '{{`Kubernetes API Server {{ $labels.verb }} request latency is too high.`}}' opsrecipe: apiserver-overloaded/ - expr: histogram_quantile(0.95, sum(rate(apiserver_request_duration_seconds_bucket{cluster_type="management_cluster", verb=~"CONNECT|DELETE|GET|PATCH|POST|PUT"}[1h])) by (verb, le)) > 1 + expr: histogram_quantile(0.95, sum(rate(apiserver_request_duration_seconds_bucket{cluster_type="management_cluster", verb=~"CONNECT|DELETE|GET|PATCH|POST|PUT"}[1h])) by 
(cluster_id, verb, le)) > 1 for: 1h labels: area: kaas diff --git a/helm/prometheus-rules/templates/alerting-rules/apiserver.workload-cluster.rules.yml b/helm/prometheus-rules/templates/alerting-rules/apiserver.workload-cluster.rules.yml index edd96b89a..d4b00066b 100644 --- a/helm/prometheus-rules/templates/alerting-rules/apiserver.workload-cluster.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/apiserver.workload-cluster.rules.yml @@ -19,7 +19,7 @@ spec: annotations: description: '{{`Kubernetes API Server {{ $labels.verb }} request latency is too high.`}}' opsrecipe: apiserver-overloaded/ - expr: histogram_quantile(0.95, sum(rate(apiserver_request_duration_seconds_bucket{verb=~"CONNECT|DELETE|GET|PATCH|POST|PUT"}[1h])) by (verb, le)) > 1 + expr: histogram_quantile(0.95, sum(rate(apiserver_request_duration_seconds_bucket{cluster_type="workload_cluster", verb=~"CONNECT|DELETE|GET|PATCH|POST|PUT"}[1h])) by (cluster_id, verb, le)) > 1 for: 1h labels: area: kaas @@ -35,7 +35,7 @@ spec: annotations: description: '{{`Kubernetes API Server {{ $labels.cluster_id }} having admission webhook errors.`}}' opsrecipe: apiserver-admission-webhook-errors/ - expr: rate(apiserver_admission_webhook_rejection_count{error_type=~"calling_webhook_error|apiserver_internal_error"}[5m]) > 0 + expr: rate(apiserver_admission_webhook_rejection_count{cluster_type="workload_cluster", error_type=~"calling_webhook_error|apiserver_internal_error"}[5m]) > 0 for: 5m labels: area: kaas diff --git a/helm/prometheus-rules/templates/alerting-rules/calico.rules.yml b/helm/prometheus-rules/templates/alerting-rules/calico.rules.yml index a96838e9b..9adbc20e6 100644 --- a/helm/prometheus-rules/templates/alerting-rules/calico.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/calico.rules.yml @@ -1,3 +1,4 @@ +{{- if eq .Values.managementCluster.provider.flavor "vintage" }} apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule metadata: @@ -37,3 +38,4 @@ spec: severity: notify team: {{ include "providerTeam" . }} topic: kubernetes +{{- end }} diff --git a/helm/prometheus-rules/templates/alerting-rules/cert-manager.rules.yml b/helm/prometheus-rules/templates/alerting-rules/cert-manager.rules.yml index f7ab66b53..7404af6b5 100644 --- a/helm/prometheus-rules/templates/alerting-rules/cert-manager.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/cert-manager.rules.yml @@ -17,7 +17,7 @@ spec: If memory usage value is equal to memory limit value then it is likely the pod will be evicted. If no limits are set then the pod will burst. 
`}} - expr: sum by (pod, namespace, container) (container_memory_working_set_bytes{container=~"(cert-manager)"}) / sum by (pod, namespace, container) (kube_pod_container_resource_requests_memory_bytes{container=~"(cert-manager)"}) >= 0.85 + expr: sum by (cluster_id, pod, namespace, container) (container_memory_working_set_bytes{container=~"(cert-manager|cert-manager-app-controller)"}) / sum by (cluster_id, pod, namespace, container) (kube_pod_container_resource_requests{resource="memory", unit="byte",container=~"(cert-manager|cert-manager-app-controller)"}) >= 0.85 for: 10m labels: area: kaas diff --git a/helm/prometheus-rules/templates/alerting-rules/certificate.all.rules.yml b/helm/prometheus-rules/templates/alerting-rules/certificate.all.rules.yml index c1c497f09..a04fda923 100644 --- a/helm/prometheus-rules/templates/alerting-rules/certificate.all.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/certificate.all.rules.yml @@ -10,6 +10,7 @@ spec: groups: - name: certificate.all rules: + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} - alert: KiamCertificateSecretWillExpireInLessThanTwoWeeks annotations: description: '{{`Kiam Certificate stored in Secret {{ $labels.namespace }}/{{ $labels.name }} on {{ $labels.cluster_id }} will expire in less than two weeks.`}}' @@ -22,6 +23,7 @@ spec: severity: page team: phoenix topic: cert-manager + {{- end }} - alert: IRSACertificateSecretWillExpireInLessThanTwoWeeks annotations: description: '{{`IRSA Pod Identity Webhook Certificate stored in Secret {{ $labels.namespace }}/{{ $labels.name }} on {{ $labels.cluster_id }} will expire in less than two weeks.`}}' diff --git a/helm/prometheus-rules/templates/alerting-rules/certificate.management-cluster.rules.yml b/helm/prometheus-rules/templates/alerting-rules/certificate.management-cluster.rules.yml index b37457eab..c83fb2e78 100644 --- a/helm/prometheus-rules/templates/alerting-rules/certificate.management-cluster.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/certificate.management-cluster.rules.yml @@ -17,7 +17,7 @@ spec: annotations: description: '{{`Cannot renew Certificate for Secret {{ $labels.namespace }}/{{ $labels.certificatename }} because it is missing.`}}' opsrecipe: managed-app-cert-manager/missing-certificate-for-secret/ - expr: count(cert_exporter_secret_not_after{cluster_type="management_cluster", secretkey="tls.crt", certificatename!=""}) by (certificatename,namespace) unless count(label_replace(cert_exporter_certificate_cr_not_after{cluster_type="management_cluster"}, "certificatename", "$1", "name", "(.*)")) by (certificatename,namespace) + expr: count(cert_exporter_secret_not_after{cluster_type="management_cluster", secretkey="tls.crt", certificatename!=""}) by (cluster_id, certificatename, namespace) unless count(label_replace(cert_exporter_certificate_cr_not_after{cluster_type="management_cluster"}, "certificatename", "$1", "name", "(.*)")) by (cluster_id, certificatename,namespace) for: 5m labels: area: kaas diff --git a/helm/prometheus-rules/templates/alerting-rules/cilium.rules.yml b/helm/prometheus-rules/templates/alerting-rules/cilium.rules.yml index 6c1ef0a75..49aced72d 100644 --- a/helm/prometheus-rules/templates/alerting-rules/cilium.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/cilium.rules.yml @@ -14,7 +14,7 @@ spec: annotations: description: '{{`Cilium BPF map is about to fill up.`}}' opsrecipe: cilium-bpf-map/ - expr: avg(cilium_bpf_map_pressure) by (map_name) * 100 > 80 + expr: 
avg(cilium_bpf_map_pressure) by (cluster_id, map_name) * 100 > 80 for: 15m labels: area: kaas @@ -26,7 +26,7 @@ spec: annotations: description: '{{`Cilium BPF map is about filled up.`}}' opsrecipe: cilium-bpf-map/ - expr: avg(cilium_bpf_map_pressure) by (map_name) * 100 > 95 + expr: avg(cilium_bpf_map_pressure) by (cluster_id, map_name) * 100 > 95 for: 15m labels: area: kaas diff --git a/helm/prometheus-rules/templates/alerting-rules/coredns.rules.yml b/helm/prometheus-rules/templates/alerting-rules/coredns.rules.yml index af5454208..e1ff1cdd2 100644 --- a/helm/prometheus-rules/templates/alerting-rules/coredns.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/coredns.rules.yml @@ -14,7 +14,8 @@ spec: annotations: description: '{{`CoreDNS Deployment {{ $labels.namespace}}/{{ $labels.deployment }} is not satisfied.`}}' opsrecipe: core-dns-deployment-not-satisfied/ - expr: sum(kube_deployment_status_replicas_available{deployment=~"coredns.*"}) / (sum(kube_deployment_status_replicas_available{deployment=~"coredns.*"}) + sum(kube_deployment_status_replicas_unavailable{deployment=~"coredns.*"}))* 100 < 51 + expr: | + sum(kube_deployment_status_replicas_available{deployment=~"coredns.*"}) by (cluster_id) / (sum(kube_deployment_status_replicas_available{deployment=~"coredns.*"}) by (cluster_id) + sum(kube_deployment_status_replicas_unavailable{deployment=~"coredns.*"}) by (cluster_id))* 100 < 51 for: 10m labels: area: empowerment @@ -25,7 +26,19 @@ spec: team: cabbage topic: dns - alert: CoreDNSMaxHPAReplicasReached - expr: kube_hpa_status_current_replicas{hpa="coredns"} == kube_hpa_spec_max_replicas{hpa="coredns"} AND kube_hpa_spec_min_replicas{hpa="coredns"} != kube_hpa_spec_max_replicas{hpa="coredns"} + expr: | + ( + # This is using the deprecated HPA metric names https://github.com/kubernetes/kube-state-metrics/commit/eb01334f2d03ebc3ab25cd7b29d0ff28f6ca5ee0 + # TODO(@team-cabbage) remove once kube-state-metrics is updated to use the new metric names everywhere + kube_hpa_status_current_replicas{hpa="coredns"} == kube_hpa_spec_max_replicas{hpa="coredns"} + and + kube_hpa_spec_min_replicas{hpa="coredns"} != kube_hpa_spec_max_replicas{hpa="coredns"} + ) or ( + # This is using the new HPA metric names + kube_horizontalpodautoscaler_status_current_replicas{horizontalpodautoscaler="coredns"} == kube_horizontalpodautoscaler_spec_max_replicas{horizontalpodautoscaler="coredns"} + and + kube_horizontalpodautoscaler_spec_min_replicas{horizontalpodautoscaler="coredns"} != kube_horizontalpodautoscaler_spec_max_replicas{horizontalpodautoscaler="coredns"} + ) for: 120m labels: area: empowerment diff --git a/helm/prometheus-rules/templates/alerting-rules/crsync.rules.yml b/helm/prometheus-rules/templates/alerting-rules/crsync.rules.yml index f4995ef68..d5be1b1e6 100644 --- a/helm/prometheus-rules/templates/alerting-rules/crsync.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/crsync.rules.yml @@ -32,7 +32,7 @@ spec: annotations: description: '{{`Too many tags are not synchronised to registry mirrors.`}}' opsrecipe: crsync-too-many-tags-missing/ - expr: crsync_sync_tags_total{registry="quay.io"} - on (repository,app) group_left sum by(repository,app) (crsync_sync_tags_total{registry!="quay.io"}) > 0 + expr: crsync_sync_tags_total{registry="quay.io"} - on (cluster_id, repository, app) group_left sum by(cluster_id, repository, app) (crsync_sync_tags_total{registry!="quay.io"}) > 0 for: 1h labels: area: kaas diff --git 
a/helm/prometheus-rules/templates/alerting-rules/deployment.management-cluster.rules.yml b/helm/prometheus-rules/templates/alerting-rules/deployment.management-cluster.rules.yml index 3e518f828..b2df1ee3e 100644 --- a/helm/prometheus-rules/templates/alerting-rules/deployment.management-cluster.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/deployment.management-cluster.rules.yml @@ -17,22 +17,7 @@ spec: annotations: description: '{{`Deployment {{ $labels.namespace}}/{{ $labels.deployment }} is not satisfied.`}}' opsrecipe: deployment-not-satisfied/ - expr: kube_deployment_status_replicas_unavailable{cluster_type="management_cluster", deployment=~"alertmanager.*|grafana.*|prometheus.*|promxy.*", cluster_id!~"argali|giraffe"} > 0 - for: 30m - labels: - area: kaas - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - cancel_if_cluster_status_updating: "true" - cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }} - severity: page - team: atlas - topic: managementcluster - - alert: DeploymentNotSatisfiedChinaAtlas - annotations: - description: '{{`Deployment {{ $labels.namespace}}/{{ $labels.deployment }} is not satisfied.`}}' - opsrecipe: deployment-not-satisfied/ - expr: kube_deployment_status_replicas_unavailable{cluster_type="management_cluster", deployment=~"alertmanager.*|grafana.*|prometheus.*|promxy.*", cluster_id=~"argali|giraffe"} > 0 + expr: kube_deployment_status_replicas_unavailable{cluster_type="management_cluster", deployment=~"alertmanager.*|grafana.*|prometheus.*|promxy.*"} > 0 for: 30m labels: area: kaas diff --git a/helm/prometheus-rules/templates/alerting-rules/dex.rules.yml b/helm/prometheus-rules/templates/alerting-rules/dex.rules.yml index db8014999..03e224a89 100644 --- a/helm/prometheus-rules/templates/alerting-rules/dex.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/dex.rules.yml @@ -29,7 +29,7 @@ spec: annotations: description: '{{`dex-operator failed to renew secret of {{ $labels.app_registration_name }} for {{ $labels.app_owner }} on provider {{ $labels.provider_type }}.`}}' opsrecipe: dex-operator/ - expr: min by(app_registration_name, app_owner, app_namespace, provider_name, provider_type, installation) (aggregation:dex_operator_idp_secret_expiry_time{cluster_type="management_cluster"}) - time() < 60*60*12 + expr: min by(app_registration_name, app_owner, app_namespace, provider_name, provider_type, installation, cluster_id) (aggregation:dex_operator_idp_secret_expiry_time{cluster_type="management_cluster"}) - time() < 60*60*12 for: 30m labels: area: kaas diff --git a/helm/prometheus-rules/templates/alerting-rules/dns-operator-azure.rules.yml b/helm/prometheus-rules/templates/alerting-rules/dns-operator-azure.rules.yml index 58e15632c..2e8db77c7 100644 --- a/helm/prometheus-rules/templates/alerting-rules/dns-operator-azure.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/dns-operator-azure.rules.yml @@ -16,7 +16,7 @@ spec: opsrecipe: dns-operator-azure/ expr: |- capi_cluster_status_phase{phase="Provisioned"} - unless on (name) + unless on (cluster_id, name) label_replace(dns_operator_azure_zone_info{type="public"}, "name", "$1", "resource_group", "(.+)") for: 30m labels: @@ -31,7 +31,7 @@ spec: {{`Error rate for {{ $labels.method }} is high. 
Check dns-operator-azure logs in installation/{{ $labels.installation }}.`}} opsrecipe: dns-operator-azure/ expr: |- - sum by (method,installation) (rate(dns_operator_azure_api_request_errors_total[5m])) > 0 + sum by (cluster_id, method, installation) (rate(dns_operator_azure_api_request_errors_total[5m])) > 0 for: 15m labels: area: kaas diff --git a/helm/prometheus-rules/templates/alerting-rules/fairness.rules.yml b/helm/prometheus-rules/templates/alerting-rules/fairness.rules.yml index 4200addff..1b29fb629 100644 --- a/helm/prometheus-rules/templates/alerting-rules/fairness.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/fairness.rules.yml @@ -25,7 +25,7 @@ spec: annotations: description: '{{`Cluster {{ $labels.installation }}/{{ $labels.cluster_id }}: there are too many API requests for flow schema {{ $labels.flow_schema }}.`}}' opsrecipe: flowcontrol-rejected-requests/ - expr: sum(irate(apiserver_flowcontrol_dispatched_requests_total[1m])) by (priority_level) > (min by(priority_level) (apiserver_flowcontrol_request_concurrency_limit_overwrite) OR min by(priority_level) (apiserver_flowcontrol_request_concurrency_limit)) + expr: sum(irate(apiserver_flowcontrol_dispatched_requests_total[1m])) by (cluster_id, priority_level) > (min by(cluster_id, priority_level) (apiserver_flowcontrol_request_concurrency_limit_overwrite) OR min by(cluster_id, priority_level) (apiserver_flowcontrol_request_concurrency_limit)) for: 15m labels: area: kaas diff --git a/helm/prometheus-rules/templates/alerting-rules/fluentbit.rules.yml b/helm/prometheus-rules/templates/alerting-rules/fluentbit.rules.yml index c3cdbc932..93b9d51d8 100644 --- a/helm/prometheus-rules/templates/alerting-rules/fluentbit.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/fluentbit.rules.yml @@ -46,7 +46,7 @@ spec: annotations: description: '{{`Fluentbit is down on node ({{ $labels.node }}).`}}' opsrecipe: fluentbit-down/ - expr: sum(up{app="fluent-logshipping-app"}) by (app, cluster_id, cluster_type, installation, job, namespace, provider, node) == 0 + expr: sum(up{app="fluent-logshipping-app"}) by (app, cluster_id, job, namespace, node) == 0 for: 15m labels: area: empowerment diff --git a/helm/prometheus-rules/templates/alerting-rules/flux.rules.yml b/helm/prometheus-rules/templates/alerting-rules/flux.rules.yml index 2873e7df4..d972b5a06 100644 --- a/helm/prometheus-rules/templates/alerting-rules/flux.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/flux.rules.yml @@ -107,7 +107,7 @@ spec: cancel_if_outside_working_hours: "true" team: honeybadger topic: releng -# this alert checks average reconciliation times in 10 min windows, then calculates monthly error budget usage for it + # this alert checks average reconciliation times in 10 min windows, then calculates monthly error budget usage for it - alert: FluxReconciliationLongErrorBudgetLow annotations: description: |- diff --git a/helm/prometheus-rules/templates/alerting-rules/inhibit.all.rules.yml b/helm/prometheus-rules/templates/alerting-rules/inhibit.all.rules.yml index 1c6fdf3a0..318ae2c44 100644 --- a/helm/prometheus-rules/templates/alerting-rules/inhibit.all.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/inhibit.all.rules.yml @@ -52,16 +52,12 @@ spec: - alert: InhibitionClusterWithoutWorkerNodes annotations: description: '{{`Cluster ({{ $labels.cluster_id }}) has no worker nodes.`}}' - {{- if eq .Values.managementCluster.provider.kind "aws" }} expr: sum(aws_operator_asg_desired_count) by (cluster_id) - on(cluster_id) 
sum(aws_operator_asg_desired_count{asg=~".*-tccpn-.*"}) by (cluster_id) == 0 - {{- end }} labels: area: kaas has_worker_nodes: "false" team: phoenix topic: status - {{- end }} - {{- if eq .Values.managementCluster.provider.kind "aws" }} - alert: InhibitionKiamErrors annotations: description: '{{`Kiam on cluster {{ $labels.cluster_id }} has increased error rate.`}}' diff --git a/helm/prometheus-rules/templates/alerting-rules/inhibit.management-cluster.rules.yml b/helm/prometheus-rules/templates/alerting-rules/inhibit.management-cluster.rules.yml index 59adfed5e..d23d0f156 100644 --- a/helm/prometheus-rules/templates/alerting-rules/inhibit.management-cluster.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/inhibit.management-cluster.rules.yml @@ -59,7 +59,6 @@ spec: cluster_status_deleting: "true" team: phoenix topic: status - {{- if eq .Values.managementCluster.provider.kind "aws" }} - alert: InhibitionClusterWithNoNodePools annotations: description: '{{`Cluster {{ $labels.cluster_id }} doesn''t have any node pools.`}}' @@ -96,5 +95,4 @@ spec: instance_state_not_running: "true" team: phoenix topic: status - {{- end }} {{- end }} diff --git a/helm/prometheus-rules/templates/alerting-rules/kube-state-metrics.rules.yml b/helm/prometheus-rules/templates/alerting-rules/kube-state-metrics.rules.yml index 8b15812f2..88fa3264a 100644 --- a/helm/prometheus-rules/templates/alerting-rules/kube-state-metrics.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/kube-state-metrics.rules.yml @@ -62,7 +62,7 @@ spec: opsrecipe: kube-state-metrics-down/ expr: |- # When it looks up but we don't have metrics - count({app="kube-state-metrics"}) < 10 + count({app="kube-state-metrics"}) by (cluster_id) < 10 for: 20m labels: area: kaas @@ -77,11 +77,27 @@ spec: severity: page team: atlas topic: observability + - alert: KubeConfigMapCreatedMetricMissing annotations: description: '{{`kube_configmap_created metric is missing for cluster {{ $labels.cluster_id }}.`}}' opsrecipe: kube-state-metrics-down/ + {{- if not .Values.mimir.enabled }} expr: absent(kube_configmap_created{}) + {{- else }} + expr: |- + ( + label_replace( + capi_cluster_status_condition{type="Ready", status="True"}, + "cluster_id", + "$1", + "name", + "(.*)" + ) == 1 + ) unless on (cluster_id, customer, installation, pipeline, provider, region) ( + count(kube_configmap_created{}) by (cluster_id, customer, installation, pipeline, provider, region) + ) + {{- end }} for: 30m labels: area: kaas @@ -94,7 +110,22 @@ spec: annotations: description: '{{`kube_daemonset_created metric is missing for cluster {{ $labels.cluster_id }}.`}}' opsrecipe: kube-state-metrics-down/ + {{- if not .Values.mimir.enabled }} expr: absent(kube_daemonset_created{}) + {{- else }} + expr: |- + ( + label_replace( + capi_cluster_status_condition{type="Ready", status="True"}, + "cluster_id", + "$1", + "name", + "(.*)" + ) == 1 + ) unless on (cluster_id, customer, installation, pipeline, provider, region) ( + count(kube_daemonset_created{}) by (cluster_id, customer, installation, pipeline, provider, region) + ) + {{- end }} for: 30m labels: area: kaas @@ -107,7 +138,22 @@ spec: annotations: description: '{{`kube_deployment_created metric is missing for cluster {{ $labels.cluster_id }}.`}}' opsrecipe: kube-state-metrics-down/ + {{- if not .Values.mimir.enabled }} expr: absent(kube_deployment_created{}) + {{- else }} + expr: |- + ( + label_replace( + capi_cluster_status_condition{type="Ready", status="True"}, + "cluster_id", + "$1", + "name", + "(.*)" + ) == 1 + 
) unless on (cluster_id, customer, installation, pipeline, provider, region) ( + count(kube_deployment_created{}) by (cluster_id, customer, installation, pipeline, provider, region) + ) + {{- end }} for: 30m labels: area: kaas @@ -120,7 +166,22 @@ spec: annotations: description: '{{`kube_endpoint_created metric is missing for cluster {{ $labels.cluster_id }}.`}}' opsrecipe: kube-state-metrics-down/ + {{- if not .Values.mimir.enabled }} expr: absent(kube_endpoint_created{}) + {{- else }} + expr: |- + ( + label_replace( + capi_cluster_status_condition{type="Ready", status="True"}, + "cluster_id", + "$1", + "name", + "(.*)" + ) == 1 + ) unless on (cluster_id, customer, installation, pipeline, provider, region) ( + count(kube_endpoint_created{}) by (cluster_id, customer, installation, pipeline, provider, region) + ) + {{- end }} for: 30m labels: area: kaas @@ -133,7 +194,22 @@ spec: annotations: description: '{{`kube_namespace_created metric is missing for cluster {{ $labels.cluster_id }}.`}}' opsrecipe: kube-state-metrics-down/ + {{- if not .Values.mimir.enabled }} expr: absent(kube_namespace_created{}) + {{- else }} + expr: |- + ( + label_replace( + capi_cluster_status_condition{type="Ready", status="True"}, + "cluster_id", + "$1", + "name", + "(.*)" + ) == 1 + ) unless on (cluster_id, customer, installation, pipeline, provider, region) ( + count(kube_namespace_created{}) by (cluster_id, customer, installation, pipeline, provider, region) + ) + {{- end }} for: 30m labels: area: kaas @@ -146,7 +222,22 @@ spec: annotations: description: '{{`kube_node_created metric is missing for cluster {{ $labels.cluster_id }}.`}}' opsrecipe: kube-state-metrics-down/ + {{- if not .Values.mimir.enabled }} expr: absent(kube_node_created{}) + {{- else }} + expr: |- + ( + label_replace( + capi_cluster_status_condition{type="Ready", status="True"}, + "cluster_id", + "$1", + "name", + "(.*)" + ) == 1 + ) unless on (cluster_id, customer, installation, pipeline, provider, region) ( + count(kube_node_created{}) by (cluster_id, customer, installation, pipeline, provider, region) + ) + {{- end }} for: 30m labels: area: kaas @@ -159,7 +250,22 @@ spec: annotations: description: '{{`kube_pod_created metric is missing for cluster {{ $labels.cluster_id }}.`}}' opsrecipe: kube-state-metrics-down/ + {{- if not .Values.mimir.enabled }} expr: absent(kube_pod_created{}) + {{- else }} + expr: |- + ( + label_replace( + capi_cluster_status_condition{type="Ready", status="True"}, + "cluster_id", + "$1", + "name", + "(.*)" + ) == 1 + ) unless on (cluster_id, customer, installation, pipeline, provider, region) ( + count(kube_pod_created{}) by (cluster_id, customer, installation, pipeline, provider, region) + ) + {{- end }} for: 30m labels: area: kaas @@ -172,7 +278,22 @@ spec: annotations: description: '{{`kube_replicaset_created metric is missing for cluster {{ $labels.cluster_id }}.`}}' opsrecipe: kube-state-metrics-down/ + {{- if not .Values.mimir.enabled }} expr: absent(kube_replicaset_created{}) + {{- else }} + expr: |- + ( + label_replace( + capi_cluster_status_condition{type="Ready", status="True"}, + "cluster_id", + "$1", + "name", + "(.*)" + ) == 1 + ) unless on (cluster_id, customer, installation, pipeline, provider, region) ( + count(kube_replicaset_created{}) by (cluster_id, customer, installation, pipeline, provider, region) + ) + {{- end }} for: 30m labels: area: kaas @@ -185,7 +306,22 @@ spec: annotations: description: '{{`kube_secret_created metric is missing for cluster {{ $labels.cluster_id }}.`}}' opsrecipe: 
kube-state-metrics-down/ + {{- if not .Values.mimir.enabled }} expr: absent(kube_secret_created{}) + {{- else }} expr: |- ( label_replace( capi_cluster_status_condition{type="Ready", status="True"}, "cluster_id", "$1", "name", "(.*)" ) == 1 ) unless on (cluster_id, customer, installation, pipeline, provider, region) ( count(kube_secret_created{}) by (cluster_id, customer, installation, pipeline, provider, region) ) + {{- end }} for: 30m labels: area: kaas @@ -198,7 +334,22 @@ spec: annotations: description: '{{`kube_service_created metric is missing for cluster {{ $labels.cluster_id }}.`}}' opsrecipe: kube-state-metrics-down/ + {{- if not .Values.mimir.enabled }} expr: absent(kube_service_created{}) + {{- else }} expr: |- ( label_replace( capi_cluster_status_condition{type="Ready", status="True"}, "cluster_id", "$1", "name", "(.*)" ) == 1 ) unless on (cluster_id, customer, installation, pipeline, provider, region) ( count(kube_service_created{}) by (cluster_id, customer, installation, pipeline, provider, region) ) + {{- end }} for: 30m labels: area: kaas diff --git a/helm/prometheus-rules/templates/alerting-rules/kyverno.all.rules.yml b/helm/prometheus-rules/templates/alerting-rules/kyverno.all.rules.yml index 9cfab2cbb..7557182c4 100644 --- a/helm/prometheus-rules/templates/alerting-rules/kyverno.all.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/kyverno.all.rules.yml @@ -14,7 +14,7 @@ spec: annotations: description: 'Kyverno has no available replicas but webhooks are present.' opsrecipe: kyverno-webhooks/ - expr: sum(kube_validatingwebhookconfiguration_info{validatingwebhookconfiguration=~"kyverno-.*"}) > 0 and sum(kube_deployment_status_replicas{deployment=~"kyverno|kyverno-admission-controller"}) == 0 + expr: sum(kube_validatingwebhookconfiguration_info{validatingwebhookconfiguration=~"kyverno-.*"}) by (cluster_id) > 0 and sum(kube_deployment_status_replicas{deployment=~"kyverno|kyverno-admission-controller"}) by (cluster_id) == 0 for: 15m labels: area: managedservices @@ -63,7 +63,7 @@ spec: annotations: description: 'Kyverno has been scaled down for too long.' opsrecipe: kyverno-scaled-down/ - expr: sum(kube_deployment_spec_replicas{deployment=~"kyverno|kyverno-kyverno-plugin|kyverno-policy-reporter"}) == 0 + expr: sum(kube_deployment_spec_replicas{deployment=~"kyverno|kyverno-kyverno-plugin|kyverno-policy-reporter"}) by (cluster_id) == 0 for: 4h labels: area: managedservices @@ -78,7 +78,7 @@ spec: annotations: description: "Kyverno's admission controller deployment must use at least 3 replicas, or be scaled to 0." opsrecipe: KyvernoWronglyScaled/ - expr: sum(kube_deployment_spec_replicas{deployment="kyverno"}) != 0 and sum(kube_deployment_spec_replicas{deployment="kyverno"}) < 3 + expr: sum(kube_deployment_spec_replicas{deployment="kyverno"}) by (cluster_id) != 0 and sum(kube_deployment_spec_replicas{deployment="kyverno"}) by (cluster_id) < 3 for: 1h labels: area: managedservices diff --git a/helm/prometheus-rules/templates/alerting-rules/loki.all.rules.yml b/helm/prometheus-rules/templates/alerting-rules/loki.all.rules.yml index 6767fb0d7..9eb724803 100644 --- a/helm/prometheus-rules/templates/alerting-rules/loki.all.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/loki.all.rules.yml @@ -15,9 +15,9 @@ spec: description: This alert checks that we have less than 10% errors on Loki requests. 
opsrecipe: loki/ expr: | - 100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[1m])) by (namespace, job, route) + 100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[1m])) by (cluster_id, namespace, job, route) / - sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route) + sum(rate(loki_request_duration_seconds_count[1m])) by (cluster_id, namespace, job, route) > 10 for: 120m labels: @@ -36,7 +36,7 @@ spec: description: This alert checks that we have no panic errors on Loki. opsrecipe: loki/ expr: | - sum(increase(loki_panic_total[10m])) by (namespace, job) > 0 + sum(increase(loki_panic_total[10m])) by (cluster_id, namespace, job) > 0 labels: area: managedservices cancel_if_apiserver_down: "true" diff --git a/helm/prometheus-rules/templates/alerting-rules/management-cluster.rules.yml b/helm/prometheus-rules/templates/alerting-rules/management-cluster.rules.yml index f9903f286..e4bc26f81 100644 --- a/helm/prometheus-rules/templates/alerting-rules/management-cluster.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/management-cluster.rules.yml @@ -15,7 +15,7 @@ spec: annotations: description: '{{`Management cluster {{ $labels.cluster_id }} has less than 3 nodes.`}}' opsrecipe: management-cluster-less-than-three-workers/ - expr: sum(kubelet_node_name{cluster_type="management_cluster"} * on (node) kube_node_role{role="worker", cluster_type="management_cluster"}) < 3 + expr: sum(kubelet_node_name{cluster_type="management_cluster"} * on (cluster_id, node) kube_node_role{role="worker", cluster_type="management_cluster"}) by (cluster_id) < 3 for: 1h labels: area: kaas @@ -26,7 +26,7 @@ spec: - alert: ManagementClusterMissingNodes annotations: description: '{{`Management cluster {{ $labels.cluster_id }} has less than 4 minimum nodes.`}}' - expr: sum(kube_node_status_condition{cluster_type="management_cluster", condition="Ready", status="true"}) < 4 + expr: sum(kube_node_status_condition{cluster_type="management_cluster", condition="Ready", status="true"}) by (cluster_id) < 4 for: 15m labels: area: kaas diff --git a/helm/prometheus-rules/templates/alerting-rules/mimir.rules.yml b/helm/prometheus-rules/templates/alerting-rules/mimir.rules.yml index b22ab1d5c..850221c15 100644 --- a/helm/prometheus-rules/templates/alerting-rules/mimir.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/mimir.rules.yml @@ -12,7 +12,7 @@ spec: - alert: MimirComponentDown annotations: description: '{{`Mimir component : {{ $labels.service }} is down.`}}' - expr: count(up{app="mimir"} == 0) by (service) > 0 + expr: count(up{app="mimir"} == 0) by (cluster_id, service) > 0 for: 5m labels: area: managedservices diff --git a/helm/prometheus-rules/templates/alerting-rules/node.management_cluster.rules.yml b/helm/prometheus-rules/templates/alerting-rules/node.management_cluster.rules.yml index 104b73d5d..84f43b47b 100644 --- a/helm/prometheus-rules/templates/alerting-rules/node.management_cluster.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/node.management_cluster.rules.yml @@ -25,7 +25,7 @@ spec: # relabelling 'ip' to 'label_ip' to match against 'kube_node_labels'. 
annotations: description: '{{`Node {{ $labels.label_ip }} status is flapping under load.`}}' - expr: label_replace(node_load15{cluster_type="management_cluster"} / count(count(node_cpu{cluster_type="management_cluster"}) without (mode)) without (cpu) >= 2, "label_ip", "$1", "ip", "(.*)" ) unless on (label_ip) kube_node_labels{cluster_type="management_cluster"} and on (ip) changes(kube_node_status_condition{cluster_type="management_cluster", condition="Ready", status="true"}[30m]) >= 6 + expr: label_replace(node_load15{cluster_type="management_cluster"} / count(count(node_cpu_seconds_total{cluster_type="management_cluster"}) without (mode)) without (cpu) >= 2, "label_ip", "$1", "ip", "(.*)" ) unless on (label_ip) kube_node_labels{cluster_type="management_cluster"} and on (ip) changes(kube_node_status_condition{cluster_type="management_cluster", condition="Ready", status="true"}[30m]) >= 6 for: 10m labels: area: kaas @@ -90,7 +90,7 @@ spec: - alert: MachineLoadTooHigh annotations: description: '{{`Machine {{ $labels.instance }} CPU load is too high.`}}' - expr: node_load5{cluster_type="management_cluster"} > 2 * count(node_cpu{cluster_type="management_cluster", mode="idle"}) without (cpu,mode) + expr: node_load5{cluster_type="management_cluster"} > 2 * count(node_cpu_seconds_total{cluster_type="management_cluster", mode="idle"}) without (cpu,mode) for: 3m labels: area: kaas diff --git a/helm/prometheus-rules/templates/alerting-rules/node.workload_cluster.rules.yml b/helm/prometheus-rules/templates/alerting-rules/node.workload_cluster.rules.yml index aa3dd0d9d..949c00ece 100644 --- a/helm/prometheus-rules/templates/alerting-rules/node.workload_cluster.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/node.workload_cluster.rules.yml @@ -27,6 +27,7 @@ spec: severity: notify team: {{ include "providerTeam" . }} topic: kubernetes + {{- if eq .Values.managementCluster.provider.kind "aws" }} - alert: AWSWorkloadClusterNodeTooManyAutoTermination annotations: description: '{{`Cluster {{ $labels.cluster_id }} has too many nodes terminated by node auto termination feature in a short time.`}}' @@ -41,6 +42,7 @@ spec: severity: page team: phoenix topic: kubernetes + {{- end }} - alert: NodeStateFlappingUnderLoad # Check if the kubelet status is flapping, unless the node is under load. # It helps to read this rule from the bottom upwards. @@ -53,7 +55,7 @@ spec: # relabelling 'ip' to 'label_ip' to match against 'kube_node_labels'. annotations: description: '{{`Node {{ $labels.label_ip }} status is flapping under load.`}}' - expr: label_replace( node_load15 / count(count(node_cpu) without (mode)) without (cpu) >= 2, "label_ip", "$1", "ip", "(.*)" ) unless on (label_ip) kube_node_labels and on (ip) changes(kube_node_status_condition{condition="Ready", status="true"}[30m]) >= 6 + expr: label_replace(node_load15{cluster_type="workload_cluster"} / count(count(node_cpu_seconds_total{cluster_type="workload_cluster"}) without (mode)) without (cpu) >= 2, "label_ip", "$1", "ip", "(.*)" ) unless on (cluster_id, label_ip) kube_node_labels{cluster_type="workload_cluster"} and on (cluster_id, ip) changes(kube_node_status_condition{cluster_type="workload_cluster", condition="Ready", status="true"}[30m]) >= 6 for: 10m labels: area: kaas @@ -68,7 +70,7 @@ spec: # in the last hour. 
annotations: description: '{{`Node {{ $labels.ip }} has constant OOM kills.`}}' - expr: kube_pod_container_status_restarts_total{namespace=~"(giantswarm|kube-system)"} - kube_pod_container_status_restarts_total offset 1h >= 1 AND ignoring(reason) kube_pod_container_status_last_terminated_reason{reason='OOMKilled'} > 0 + expr: kube_pod_container_status_restarts_total{cluster_type="workload_cluster", namespace=~"(giantswarm|kube-system)"} - kube_pod_container_status_restarts_total{cluster_type="workload_cluster"} offset 1h >= 1 AND ignoring(reason) kube_pod_container_status_last_terminated_reason{cluster_type="workload_cluster", reason="OOMKilled"} > 0 for: 10m labels: area: kaas @@ -82,7 +84,7 @@ spec: annotations: description: '{{`Node {{ $labels.node }} reports a connection usage above 85% for the last 15 minutes.`}}' opsrecipe: node-conntrack-limits/ - expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit >= 0.85 + expr: node_nf_conntrack_entries{cluster_type="workload_cluster"} / node_nf_conntrack_entries_limit{cluster_type="workload_cluster"} >= 0.85 for: 15m labels: area: kaas @@ -94,7 +96,7 @@ spec: annotations: description: '{{`Machine {{ $labels.instance }} entropy is too low.`}}' opsrecipe: low-entropy/ - expr: node_entropy_available_bits < 250 + expr: node_entropy_available_bits{cluster_type="workload_cluster"} < 250 for: 10m labels: area: kaas @@ -106,7 +108,7 @@ spec: annotations: description: '{{`Machine {{ $labels.instance }} has too many allocated file descriptors.`}}' opsrecipe: high-number-file-descriptors/ - expr: node_filefd_allocated / node_filefd_maximum * 100 > 80 + expr: node_filefd_allocated{cluster_type="workload_cluster"} / node_filefd_maximum{cluster_type="workload_cluster"} * 100 > 80 for: 15m labels: area: kaas @@ -144,7 +146,7 @@ spec: ( node_memory_MemFree_bytes{cluster_type="workload_cluster"} + node_memory_Cached_bytes{cluster_type="workload_cluster"} ) < 2147483648) - and on (node) kube_node_role{role=~"control-plane|master"} + and on (cluster_id, node) kube_node_role{cluster_type="workload_cluster", role=~"control-plane|master"} for: 60m labels: area: kaas diff --git a/helm/prometheus-rules/templates/alerting-rules/operatorkit.rules.yml b/helm/prometheus-rules/templates/alerting-rules/operatorkit.rules.yml index 09f73a963..d7f02f364 100644 --- a/helm/prometheus-rules/templates/alerting-rules/operatorkit.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/operatorkit.rules.yml @@ -77,7 +77,7 @@ spec: annotations: description: '{{`{{ $labels.namespace }}/{{ $labels.app }}@{{ $labels.app_version }} has stopped the reconciliation. Please check logs.`}}' opsrecipe: operator-not-reconciling/ - expr: (sum by (instance, app, app_version, namespace)(increase(operatorkit_controller_event_count{app=~"aws-operator.+|cluster-operator.+"}[10m])) == 0 and on (instance) (operatorkit_controller_deletion_timestamp or operatorkit_controller_creation_timestamp)) + expr: (sum by (cluster_id, instance, app, app_version, namespace)(increase(operatorkit_controller_event_count{app=~"aws-operator.+|cluster-operator.+"}[10m])) == 0 and on (cluster_id, instance) (operatorkit_controller_deletion_timestamp or operatorkit_controller_creation_timestamp)) for: 20m labels: area: kaas @@ -105,7 +105,7 @@ spec: annotations: description: '{{`{{ $labels.namespace }}/{{ $labels.app }}@{{ $labels.app_version }} has stopped the reconciliation. 
Please check logs.`}}' opsrecipe: operator-not-reconciling/ - expr: (sum by (instance, app, app_version, namespace)(increase(operatorkit_controller_event_count{app=~"node-operator"}[10m])) == 0 and on (instance) (operatorkit_controller_deletion_timestamp or operatorkit_controller_creation_timestamp)) + expr: (sum by (cluster_id, instance, app, app_version, namespace)(increase(operatorkit_controller_event_count{app=~"node-operator"}[10m])) == 0 and on (cluster_id, instance) (operatorkit_controller_deletion_timestamp or operatorkit_controller_creation_timestamp)) for: 20m labels: area: kaas diff --git a/helm/prometheus-rules/templates/alerting-rules/prometheus-agent.rules.yml b/helm/prometheus-rules/templates/alerting-rules/prometheus-agent.rules.yml index 3f6cf73c5..256711de2 100644 --- a/helm/prometheus-rules/templates/alerting-rules/prometheus-agent.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/prometheus-agent.rules.yml @@ -17,7 +17,7 @@ spec: summary: Prometheus agent fails to send samples to remote write endpoint. opsrecipe: prometheus-agent/ dashboard: promRW001/prometheus-remote-write - # expr: count(absent_over_time(up{instance="prometheus-agent"}[10m])) + {{- if not .Values.mimir.enabled }} expr: |- max_over_time( sum by (cluster_type, cluster_id, installation, instance, service) @@ -27,6 +27,20 @@ spec: absent(up{instance="prometheus-agent"}) == 1 )[5m:] ) + {{- else }} + expr: |- + ( + label_replace( + capi_cluster_status_condition{type="Ready", status="True"}, + "cluster_id", + "$1", + "name", + "(.*)" + ) == 1 + ) unless on (cluster_id, customer, installation, pipeline, provider, region) ( + count(up{job="prometheus-agent"} > 0) by (cluster_id, customer, installation, pipeline, provider, region) + ) + {{- end }} for: 20m labels: area: empowerment @@ -44,7 +58,7 @@ spec: summary: Prometheus agent fails to send samples to remote write endpoint. opsrecipe: prometheus-agent/ dashboard: promRW001/prometheus-remote-write - # expr: count(absent_over_time(up{instance="prometheus-agent"}[10m])) + {{- if not .Values.mimir.enabled }} expr: |- max_over_time( sum by (cluster_type, cluster_id, installation, instance, service) @@ -54,6 +68,20 @@ spec: absent(up{instance="prometheus-agent"}) == 1 )[5m:] ) + {{- else }} + expr: |- + ( + label_replace( + capi_cluster_status_condition{type="Ready", status="True"}, + "cluster_id", + "$1", + "name", + "(.*)" + ) == 1 + ) unless on (cluster_id, customer, installation, pipeline, provider, region) ( + count(up{job="prometheus-agent"} > 0) by (cluster_id, customer, installation, pipeline, provider, region) + ) + {{- end }} for: 1m labels: area: empowerment @@ -71,23 +99,18 @@ spec: summary: Prometheus agent is missing shards. 
opsrecipe: prometheus-agent/ expr: |- - max_over_time(sum( + max_over_time(sum by (cluster_id)( count( ## number of remotes that are not mimir or grafana-cloud prometheus_remote_storage_metadata_total{remote_name!~"grafana-cloud|mimir.*"} - ) + ) by (cluster_id) != sum( ## number of shards defined in the Prometheus CR prometheus_operator_spec_shards{controller="prometheus",name="prometheus-agent"} - or - ( - # if there is only 1 shard, there is no shard metric so we use the replicas metric - absent(prometheus_operator_spec_shards{controller="prometheus",name="prometheus-agent"}) - and on(controller, name) - prometheus_operator_spec_replicas{controller="prometheus",name="prometheus-agent"} - ) - ) + # if there is only 1 shard, there is no shard metric so we use the replicas metric + or prometheus_operator_spec_replicas{controller="prometheus",name="prometheus-agent"} + ) by (cluster_id) )[5m:]) for: 20m labels: @@ -107,23 +130,18 @@ spec: summary: Prometheus agent is missing shards. opsrecipe: prometheus-agent/ expr: |- - max_over_time(sum( + max_over_time(sum by (cluster_id)( count( ## number of remotes that are not mimir or grafana-cloud prometheus_remote_storage_metadata_total{remote_name!~"grafana-cloud|mimir.*"} - ) + ) by (cluster_id) != sum( ## number of shards defined in the Prometheus CR prometheus_operator_spec_shards{controller="prometheus",name="prometheus-agent"} - or - ( - # if there is only 1 shard, there is no shard metric so we use the replicas metric - absent(prometheus_operator_spec_shards{controller="prometheus",name="prometheus-agent"}) - and on(controller, name) - prometheus_operator_spec_replicas{controller="prometheus",name="prometheus-agent"} - ) - ) + # if there is only 1 shard, there is no shard metric so we use the replicas metric + or prometheus_operator_spec_replicas{controller="prometheus",name="prometheus-agent"} + ) by (cluster_id) )[5m:]) for: 1m labels: diff --git a/helm/prometheus-rules/templates/alerting-rules/prometheus-operator.rules.yml b/helm/prometheus-rules/templates/alerting-rules/prometheus-operator.rules.yml index 90f6e4359..842d5aac0 100644 --- a/helm/prometheus-rules/templates/alerting-rules/prometheus-operator.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/prometheus-operator.rules.yml @@ -33,7 +33,7 @@ spec: - alert: PrometheusOperatorListErrors annotations: description: Errors while performing List operations in controller {{`{{`}}$labels.controller{{`}}`}} in {{`{{`}}$labels.namespace{{`}}`}} namespace. - expr: (sum by (cluster_id, controller,namespace) (rate(prometheus_operator_list_operations_failed_total{app=~"prometheus-operator.*|kube-prometheus-.*"}[10m])) / sum by (cluster_id, controller,namespace) (rate(prometheus_operator_list_operations_total{app=~"prometheus-operator.*|kube-prometheus-.*"}[10m]))) > 0.4 + expr: (sum by (cluster_id, controller, namespace) (rate(prometheus_operator_list_operations_failed_total{app=~"prometheus-operator.*|kube-prometheus-.*"}[10m])) / sum by (cluster_id, controller,namespace) (rate(prometheus_operator_list_operations_total{app=~"prometheus-operator.*|kube-prometheus-.*"}[10m]))) > 0.4 for: 15m labels: area: empowerment @@ -88,7 +88,7 @@ spec: - alert: PrometheusOperatorNotReady annotations: description: Prometheus operator in {{`{{`}} $labels.namespace {{`}}`}} namespace isn't ready to reconcile {{`{{`}} $labels.controller {{`}}`}} resources. 
- expr: min by(cluster_id, namespace, controller) (max_over_time(prometheus_operator_ready{app=~"prometheus-operator.*|kube-prometheus-.*"}[5m]) == 0) + expr: min by (cluster_id, namespace, controller) (max_over_time(prometheus_operator_ready{app=~"prometheus-operator.*|kube-prometheus-.*"}[5m]) == 0) for: 5m labels: area: empowerment diff --git a/helm/prometheus-rules/templates/alerting-rules/promtail.rules.yml b/helm/prometheus-rules/templates/alerting-rules/promtail.rules.yml index 63b6d2097..f11abe93b 100644 --- a/helm/prometheus-rules/templates/alerting-rules/promtail.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/promtail.rules.yml @@ -13,7 +13,7 @@ spec: annotations: description: '{{`Scraping of all promtail pods to check if one failed every 5 minutes.`}}' opsrecipe: promtail-is-not-running/ - expr: count(up{container="promtail"} == 0) > 0 + expr: count(up{container="promtail"} == 0) by (cluster_id) > 0 for: 5m labels: area: "empowerment" @@ -29,7 +29,7 @@ spec: description: This alert checks if that the amount of failed requests is below 10% for promtail opsrecipe: promtail-requests-are-failing/ expr: | - 100 * sum(rate(promtail_request_duration_seconds_count{status_code=~"5..|failed"}[1m])) by (namespace, job, route, instance) / sum(rate(promtail_request_duration_seconds_count[1m])) by (namespace, job, route, instance) > 10 + 100 * sum(rate(promtail_request_duration_seconds_count{status_code=~"5..|failed"}[1m])) by (cluster_id, namespace, job, route, instance) / sum(rate(promtail_request_duration_seconds_count[1m])) by (cluster_id, namespace, job, route, instance) > 10 for: 15m labels: area: "empowerment" diff --git a/helm/prometheus-rules/templates/alerting-rules/service-level.rules.yml b/helm/prometheus-rules/templates/alerting-rules/service-level.rules.yml index 3e5e5c195..e422c0c2c 100644 --- a/helm/prometheus-rules/templates/alerting-rules/service-level.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/service-level.rules.yml @@ -19,21 +19,21 @@ spec: label_replace( ( slo_errors_per_request:ratio_rate1h{service!~"efk-.*|.*external-dns.*|kong-.*|.*(ingress-nginx|nginx-ingress-controller).*"} - > on (service) group_left () + > on (cluster_id, service) group_left () slo_threshold_high and slo_errors_per_request:ratio_rate5m{service!~"efk-.*|.*external-dns.*|kong-.*|.*(ingress-nginx|nginx-ingress-controller).*"} - > on (service) group_left () + > on (cluster_id, service) group_left () slo_threshold_high ) or ( slo_errors_per_request:ratio_rate6h{service!~"efk-.*|.*external-dns.*|kong-.*|.*(ingress-nginx|nginx-ingress-controller).*"} - > on (service) group_left () + > on (cluster_id, service) group_left () slo_threshold_low and slo_errors_per_request:ratio_rate30m{service!~"efk-.*|.*external-dns.*|kong-.*|.*(ingress-nginx|nginx-ingress-controller).*"} - > on (service) group_left () + > on (cluster_id, service) group_left () slo_threshold_low ), "team", diff --git a/helm/prometheus-rules/templates/alerting-rules/silence-operator.rules.yml b/helm/prometheus-rules/templates/alerting-rules/silence-operator.rules.yml index ce09c1305..6756a633f 100644 --- a/helm/prometheus-rules/templates/alerting-rules/silence-operator.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/silence-operator.rules.yml @@ -30,7 +30,7 @@ spec: # This alert triggers when the silence operator sync job did not schedule for more than 1 day # or if the job did not run successfully at least once in the last day expr: (time() - 
kube_cronjob_status_last_schedule_time{cronjob="silence-operator-sync", cluster_type="management_cluster"}) > 86400 - or count(max_over_time(kube_job_status_succeeded{job_name=~"silence-operator-sync.+", cluster_type="management_cluster"}[1d]) == 1) == 0 + or count(max_over_time(kube_job_status_succeeded{job_name=~"silence-operator-sync.+", cluster_type="management_cluster"}[1d]) == 1) by (cluster_id) == 0 labels: area: empowerment severity: page diff --git a/helm/prometheus-rules/templates/alerting-rules/sloth.rules.yml b/helm/prometheus-rules/templates/alerting-rules/sloth.rules.yml index 0daee50a3..93a5a1257 100644 --- a/helm/prometheus-rules/templates/alerting-rules/sloth.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/sloth.rules.yml @@ -12,7 +12,7 @@ spec: - alert: SlothDown annotations: description: 'Sloth is down.' - expr: count(up{app="sloth"} == 0) > 0 + expr: count(up{app="sloth"} == 0) by (cluster_id) > 0 for: 5m labels: area: managedservices diff --git a/test/tests/providers/global/kyverno.all.rules.test.yml b/test/tests/providers/global/kyverno.all.rules.test.yml index 6b89a5071..b5044d4a2 100644 --- a/test/tests/providers/global/kyverno.all.rules.test.yml +++ b/test/tests/providers/global/kyverno.all.rules.test.yml @@ -18,10 +18,10 @@ tests: - series: 'kube_deployment_spec_replicas{app="kube-state-metrics", cluster_id="gremlin", cluster_type="management_cluster", container="kube-state-metrics", customer="giantswarm", deployment="kyverno", endpoint="http", installation="gremlin", instance="10.0.135.241:8080", job="kube-state-metrics", namespace="kyverno", node="master-00000y", organization="giantswarm", pipeline="testing", pod="prometheus-operator-app-kube-state-metrics-d7f4ff68d-qn6sb", prometheus="kube-system/prometheus-agent", prometheus_replica="prometheus-prometheus-agent-0", provider="aws", region="germanywestcentral", service="prometheus-operator-app-kube-state-metrics", service_priority="highest"}' values: "0+0x240 1+0x70" # Kyverno admission reports - - series: 'aggregation:kyverno_resource_counts{kind="admissionreports.kyverno.io"}' + - series: 'aggregation:kyverno_resource_counts{cluster_id="gremlin", kind="admissionreports.kyverno.io"}' values: "0+1000x30 30000+1500x30" # Kyverno updaterequests - - series: 'aggregation:kyverno_resource_counts{kind="updaterequests.kyverno.io"}' + - series: 'aggregation:kyverno_resource_counts{cluster_id="gremlin", kind="updaterequests.kyverno.io"}' values: "0+100x15 5000+1500x30" alert_rule_test: # Webhooks alert @@ -30,6 +30,7 @@ tests: exp_alerts: - exp_labels: area: managedservices + cluster_id: gremlin severity: page team: shield topic: kyverno @@ -46,6 +47,7 @@ tests: exp_alerts: - exp_labels: area: managedservices + cluster_id: gremlin severity: page team: shield topic: kyverno @@ -63,6 +65,7 @@ tests: exp_alerts: - exp_labels: area: managedservices + cluster_id: gremlin severity: notify team: shield topic: kyverno @@ -80,6 +83,7 @@ tests: exp_alerts: - exp_labels: area: managedservices + cluster_id: gremlin severity: notify team: shield topic: kyverno @@ -96,6 +100,7 @@ tests: exp_alerts: - exp_labels: area: managedservices + cluster_id: gremlin severity: notify team: shield topic: kyverno diff --git a/test/tests/providers/global/loki.all.rules.test.yml b/test/tests/providers/global/loki.all.rules.test.yml index 3aa69fea1..03bb95fe6 100644 --- a/test/tests/providers/global/loki.all.rules.test.yml +++ b/test/tests/providers/global/loki.all.rules.test.yml @@ -14,7 +14,6 @@ tests: - series: 
'loki_request_duration_seconds_count{app="loki-distributor", cluster_id="zj88t", cluster_type="workload_cluster", container="distributor", customer="giantswarm", installation="gorilla", instance="10.7.75.90:3100", job="zj88t-prometheus/workload-zj88t/0", method="POST", namespace="loki", node="ip-10-6-2-141.eu-central-1.compute.internal", organization="giantswarm-production", pod="loki-distributor-74b78f5559-tz6zs", provider="aws", route="loki_api_v1_push", service_priority="highest", status_code="503", ws="false"}' values: "0+0x20 0+30x160" # After 20 minutes, we also have 0.5 rq/s failing alert_rule_test: - - alertname: LokiRequestPanics eval_time: 15m # should be OK after 15 minutes exp_alerts: @@ -29,6 +28,7 @@ tests: cancel_if_cluster_status_updating: "true" cancel_if_outside_working_hours: "true" cancel_if_scrape_timeout: "true" + cluster_id: zj88t job: zj88t-prometheus/workload-zj88t/0 namespace: loki severity: page @@ -55,6 +55,7 @@ tests: cancel_if_cluster_status_updating: "true" cancel_if_outside_working_hours: "true" cancel_if_scrape_timeout: "true" + cluster_id: zj88t job: zj88t-prometheus/workload-zj88t/0 namespace: loki route: loki_api_v1_push diff --git a/test/tests/providers/global/mimir.rules.test.yml b/test/tests/providers/global/mimir.rules.test.yml index 2c4787806..0067276ad 100644 --- a/test/tests/providers/global/mimir.rules.test.yml +++ b/test/tests/providers/global/mimir.rules.test.yml @@ -28,5 +28,6 @@ tests: cancel_if_cluster_status_updating: "true" cancel_if_scrape_timeout: "true" cancel_if_outside_working_hours: "true" + cluster_id: gauss exp_annotations: description: "Mimir component : mimir-ingester is down." diff --git a/test/tests/providers/global/prometheus-agent.rules.test.yml b/test/tests/providers/global/prometheus-agent.rules.test.yml index 437313d5e..c4602f313 100644 --- a/test/tests/providers/global/prometheus-agent.rules.test.yml +++ b/test/tests/providers/global/prometheus-agent.rules.test.yml @@ -114,6 +114,7 @@ tests: exp_alerts: - exp_labels: area: empowerment + cluster_id: test01 severity: page team: atlas topic: observability @@ -131,6 +132,7 @@ tests: exp_alerts: - exp_labels: area: empowerment + cluster_id: test01 severity: none team: atlas topic: observability @@ -148,6 +150,7 @@ tests: exp_alerts: - exp_labels: area: empowerment + cluster_id: test01 severity: page team: atlas topic: observability @@ -165,6 +168,99 @@ tests: exp_alerts: - exp_labels: area: empowerment + cluster_id: test01 + severity: none + team: atlas + topic: observability + inhibit_prometheus_agent_down: "true" + cancel_if_cluster_is_not_running_prometheus_agent: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_outside_working_hours: "true" + exp_annotations: + description: "Prometheus agent is missing shards." + opsrecipe: "prometheus-agent/" + summary: "Prometheus agent is missing shards." 
+ - alertname: PrometheusAgentShardsMissing + eval_time: 130m + - alertname: PrometheusAgentShardsMissingInhibition + eval_time: 130m + # Tests for `PrometheusAgentShardsMissing` alert with missing `prometheus_operator_spec_shards` metric + - interval: 1m + input_series: + - series: 'prometheus_remote_storage_metadata_total{app="prometheus", cluster_id="test01", container="prometheus", instance="prometheus-agent", job="prometheus-agent", pod="prometheus-prometheus-agent-0", remote_name="806b63", service="prometheus-agent", team="atlas", url="https://myinstall/mycluster/api/v1/write"}' + values: "10000+0x180" + - series: 'prometheus_remote_storage_metadata_total{app="prometheus", cluster_id="test01", container="prometheus", instance="prometheus-agent", job="prometheus-agent", pod="prometheus-prometheus-agent-shard-1-0", remote_name="806b63", service="prometheus-agent", team="atlas", url="https://myinstall/mycluster/api/v1/write"}' + values: "10000+0x180" + - series: 'prometheus_remote_storage_metadata_total{app="prometheus", cluster_id="test01", container="prometheus", instance="prometheus-agent", job="prometheus-agent", pod="prometheus-prometheus-agent-shard-2-0", remote_name="806b63", service="prometheus-agent", team="atlas", url="https://myinstall/mycluster/api/v1/write"}' + values: "10000+0x180" + - series: 'prometheus_operator_spec_replicas{cluster_id="test01", container="prometheus-operator-app", controller="prometheus", instance="prometheus-operator-app", job="prometheus-operator-app-operator", name="prometheus-agent", pod="prometheus-operator-app-operator-76b5899558-nz8h5", service="prometheus-operator-app-operator", team="atlas"}' + values: '3+0x60 5+0x60 3+0x60' + alert_rule_test: + - alertname: PrometheusAgentShardsMissing + eval_time: 40m + - alertname: PrometheusAgentShardsMissingInhibition + eval_time: 40m + - alertname: PrometheusAgentShardsMissing + eval_time: 100m + exp_alerts: + - exp_labels: + area: empowerment + cluster_id: test01 + severity: page + team: atlas + topic: observability + inhibit_prometheus_agent_down: "true" + cancel_if_cluster_is_not_running_prometheus_agent: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_outside_working_hours: "true" + exp_annotations: + description: "Prometheus agent is missing shards." + opsrecipe: "prometheus-agent/" + summary: "Prometheus agent is missing shards." + - alertname: PrometheusAgentShardsMissingInhibition + eval_time: 100m + exp_alerts: + - exp_labels: + area: empowerment + cluster_id: test01 + severity: none + team: atlas + topic: observability + inhibit_prometheus_agent_down: "true" + cancel_if_cluster_is_not_running_prometheus_agent: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_outside_working_hours: "true" + exp_annotations: + description: "Prometheus agent is missing shards." + opsrecipe: "prometheus-agent/" + summary: "Prometheus agent is missing shards." + - alertname: PrometheusAgentShardsMissing + eval_time: 125m + exp_alerts: + - exp_labels: + area: empowerment + cluster_id: test01 + severity: page + team: atlas + topic: observability + inhibit_prometheus_agent_down: "true" + cancel_if_cluster_is_not_running_prometheus_agent: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_outside_working_hours: "true" + exp_annotations: + description: "Prometheus agent is missing shards." 
+ opsrecipe: "prometheus-agent/" + summary: "Prometheus agent is missing shards." + - alertname: PrometheusAgentShardsMissingInhibition + eval_time: 125m + exp_alerts: + - exp_labels: + area: empowerment + cluster_id: test01 severity: none team: atlas topic: observability diff --git a/test/tests/providers/global/promtail.rules.test.yml b/test/tests/providers/global/promtail.rules.test.yml index 0bf7ca3be..724a4b6a7 100644 --- a/test/tests/providers/global/promtail.rules.test.yml +++ b/test/tests/providers/global/promtail.rules.test.yml @@ -26,6 +26,7 @@ tests: cancel_if_cluster_status_creating: "true" cancel_if_cluster_status_deleting: "true" cancel_if_cluster_status_updating: "true" + cluster_id: gauss severity: page team: atlas topic: observability @@ -41,6 +42,7 @@ tests: cancel_if_cluster_status_creating: "true" cancel_if_cluster_status_deleting: "true" cancel_if_cluster_status_updating: "true" + cluster_id: gauss severity: page team: atlas topic: observability @@ -57,6 +59,7 @@ tests: cancel_if_cluster_status_creating: "true" cancel_if_cluster_status_deleting: "true" cancel_if_cluster_status_updating: "true" + cluster_id: gauss severity: page team: atlas topic: observability @@ -87,6 +90,7 @@ tests: cancel_if_cluster_status_creating: "true" cancel_if_cluster_status_deleting: "true" cancel_if_cluster_status_updating: "true" + cluster_id: gauss severity: page team: atlas topic: observability @@ -101,6 +105,7 @@ tests: cancel_if_cluster_status_creating: "true" cancel_if_cluster_status_deleting: "true" cancel_if_cluster_status_updating: "true" + cluster_id: gauss severity: page team: atlas topic: observability diff --git a/test/tests/providers/global/sloth.rules.test.yml b/test/tests/providers/global/sloth.rules.test.yml index c3b3f518c..05915b9fb 100644 --- a/test/tests/providers/global/sloth.rules.test.yml +++ b/test/tests/providers/global/sloth.rules.test.yml @@ -18,6 +18,7 @@ tests: exp_alerts: - exp_labels: area: managedservices + cluster_id: gauss severity: page team: atlas topic: observability