diff --git a/assets/alerts/.gitkeep b/assets/alerts/.gitkeep deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/assets/alerts/kubernetes.rules b/assets/alerts/kubernetes.rules new file mode 100644 index 0000000000..d6fcaf8aaa --- /dev/null +++ b/assets/alerts/kubernetes.rules @@ -0,0 +1,398 @@ +### Container resources ### + +cluster_namespace_controller_pod_container:spec_memory_limit_bytes = + sum by (cluster,namespace,controller,pod_name,container_name) ( + label_replace( + container_spec_memory_limit_bytes{container_name!=""}, + "controller", "$1", + "pod_name", "^(.*)-[a-z0-9]+" + ) + ) + +cluster_namespace_controller_pod_container:spec_cpu_shares = + sum by (cluster,namespace,controller,pod_name,container_name) ( + label_replace( + container_spec_cpu_shares{container_name!=""}, + "controller", "$1", + "pod_name", "^(.*)-[a-z0-9]+" + ) + ) + +cluster_namespace_controller_pod_container:cpu_usage:rate = + sum by (cluster,namespace,controller,pod_name,container_name) ( + label_replace( + irate( + container_cpu_usage_seconds_total{container_name!=""}[5m] + ), + "controller", "$1", + "pod_name", "^(.*)-[a-z0-9]+" + ) + ) + +cluster_namespace_controller_pod_container:memory_usage:bytes = + sum by (cluster,namespace,controller,pod_name,container_name) ( + label_replace( + container_memory_usage_bytes{container_name!=""}, + "controller", "$1", + "pod_name", "^(.*)-[a-z0-9]+" + ) + ) + +cluster_namespace_controller_pod_container:memory_working_set:bytes = + sum by (cluster,namespace,controller,pod_name,container_name) ( + label_replace( + container_memory_working_set_bytes{container_name!=""}, + "controller", "$1", + "pod_name", "^(.*)-[a-z0-9]+" + ) + ) + +cluster_namespace_controller_pod_container:memory_rss:bytes = + sum by (cluster,namespace,controller,pod_name,container_name) ( + label_replace( + container_memory_rss{container_name!=""}, + "controller", "$1", + "pod_name", "^(.*)-[a-z0-9]+" + ) + ) + +cluster_namespace_controller_pod_container:memory_cache:bytes = + sum by (cluster,namespace,controller,pod_name,container_name) ( + label_replace( + container_memory_cache{container_name!=""}, + "controller", "$1", + "pod_name", "^(.*)-[a-z0-9]+" + ) + ) + +cluster_namespace_controller_pod_container:disk_usage:bytes = + sum by (cluster,namespace,controller,pod_name,container_name) ( + label_replace( + container_disk_usage_bytes{container_name!=""}, + "controller", "$1", + "pod_name", "^(.*)-[a-z0-9]+" + ) + ) + +cluster_namespace_controller_pod_container:memory_pagefaults:rate = + sum by (cluster,namespace,controller,pod_name,container_name,scope,type) ( + label_replace( + irate( + container_memory_failures_total{container_name!=""}[5m] + ), + "controller", "$1", + "pod_name", "^(.*)-[a-z0-9]+" + ) + ) + +cluster_namespace_controller_pod_container:memory_oom:rate = + sum by (cluster,namespace,controller,pod_name,container_name,scope,type) ( + label_replace( + irate( + container_memory_failcnt{container_name!=""}[5m] + ), + "controller", "$1", + "pod_name", "^(.*)-[a-z0-9]+" + ) + ) + +### Cluster resources ### + +cluster:memory_allocation:percent = + 100 * sum by (cluster) ( + container_spec_memory_limit_bytes{pod_name!=""} + ) / sum by (cluster) ( + machine_memory_bytes + ) + +cluster:memory_used:percent = + 100 * sum by (cluster) ( + container_memory_usage_bytes{pod_name!=""} + ) / sum by (cluster) ( + machine_memory_bytes + ) + +cluster:cpu_allocation:percent = + 100 * sum by (cluster) ( + container_spec_cpu_shares{pod_name!=""} + ) / sum by (cluster) ( + container_spec_cpu_shares{id="/"} * on(cluster,instance) machine_cpu_cores + ) + +cluster:node_cpu_use:percent = + 100 * sum by (cluster) ( + rate(node_cpu{mode!="idle"}[5m]) + ) / sum by (cluster) ( + machine_cpu_cores + ) + +### API latency ### + +# Raw metrics are in microseconds. Convert to seconds. +cluster_resource_verb:apiserver_latency:quantile_seconds{quantile="0.99"} = + histogram_quantile( + 0.99, + sum by(le,cluster,job,resource,verb) (apiserver_request_latencies_bucket) + ) / 1e6 +cluster_resource_verb:apiserver_latency:quantile_seconds{quantile="0.9"} = + histogram_quantile( + 0.9, + sum by(le,cluster,job,resource,verb) (apiserver_request_latencies_bucket) + ) / 1e6 +cluster_resource_verb:apiserver_latency:quantile_seconds{quantile="0.5"} = + histogram_quantile( + 0.5, + sum by(le,cluster,job,resource,verb) (apiserver_request_latencies_bucket) + ) / 1e6 + +### Scheduling latency ### + +cluster:scheduler_e2e_scheduling_latency:quantile_seconds{quantile="0.99"} = + histogram_quantile(0.99,sum by (le,cluster) (scheduler_e2e_scheduling_latency_microseconds_bucket)) / 1e6 +cluster:scheduler_e2e_scheduling_latency:quantile_seconds{quantile="0.9"} = + histogram_quantile(0.9,sum by (le,cluster) (scheduler_e2e_scheduling_latency_microseconds_bucket)) / 1e6 +cluster:scheduler_e2e_scheduling_latency:quantile_seconds{quantile="0.5"} = + histogram_quantile(0.5,sum by (le,cluster) (scheduler_e2e_scheduling_latency_microseconds_bucket)) / 1e6 + +cluster:scheduler_scheduling_algorithm_latency:quantile_seconds{quantile="0.99"} = + histogram_quantile(0.99,sum by (le,cluster) (scheduler_scheduling_algorithm_latency_microseconds_bucket)) / 1e6 +cluster:scheduler_scheduling_algorithm_latency:quantile_seconds{quantile="0.9"} = + histogram_quantile(0.9,sum by (le,cluster) (scheduler_scheduling_algorithm_latency_microseconds_bucket)) / 1e6 +cluster:scheduler_scheduling_algorithm_latency:quantile_seconds{quantile="0.5"} = + histogram_quantile(0.5,sum by (le,cluster) (scheduler_scheduling_algorithm_latency_microseconds_bucket)) / 1e6 + +cluster:scheduler_binding_latency:quantile_seconds{quantile="0.99"} = + histogram_quantile(0.99,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6 +cluster:scheduler_binding_latency:quantile_seconds{quantile="0.9"} = + histogram_quantile(0.9,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6 +cluster:scheduler_binding_latency:quantile_seconds{quantile="0.5"} = + histogram_quantile(0.5,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6 + +ALERT K8SNodeDown + IF up{job="kubelets"} == 0 + FOR 1h + LABELS { + service = "k8s", + severity = "warning" + } + ANNOTATIONS { + summary = "Kubelet cannot be scraped", + description = "Prometheus could not scrape a {{ $labels.job }} for more than one hour", + } + +ALERT K8SNodeNotReady + IF kube_node_status_ready{condition="true"} == 0 + FOR 1h + LABELS { + service = "k8s", + severity = "warning", + } + ANNOTATIONS { + summary = "Node status is NotReady", + description = "The Kubelet on {{ $labels.node }} has not checked in with the API, or has set itself to NotReady, for more than an hour", + } + +ALERT K8SManyNodesNotReady + IF + count by (cluster) (kube_node_status_ready{condition="true"} == 0) > 1 + AND + ( + count by (cluster) (kube_node_status_ready{condition="true"} == 0) + / + count by (cluster) (kube_node_status_ready{condition="true"}) + ) > 0.2 + FOR 1m + LABELS { + service = "k8s", + severity = "critical", + } + ANNOTATIONS { + summary = "Many K8s nodes are Not Ready", + description = "{{ $value }} K8s nodes (more than 10% of cluster {{ $labels.cluster }}) are in the NotReady state.", + } + +ALERT K8SKubeletNodeExporterDown + IF up{job="node-exporter"} == 0 + FOR 15m + LABELS { + service = "k8s", + severity = "warning" + } + ANNOTATIONS { + summary = "Kubelet node_exporter cannot be scraped", + description = "Prometheus could not scrape a {{ $labels.job }} for more than one hour.", + } + +ALERT K8SKubeletDown + IF absent(up{job="kubelets"}) or count by (cluster) (up{job="kubelets"} == 0) / count by (cluster) (up{job="kubelets"}) > 0.1 + FOR 1h + LABELS { + service = "k8s", + severity = "critical" + } + ANNOTATIONS { + summary = "Many Kubelets cannot be scraped", + description = "Prometheus failed to scrape more than 10% of kubelets, or all Kubelets have disappeared from service discovery.", + } + +ALERT K8SApiserverDown + IF up{job="kubernetes"} == 0 + FOR 15m + LABELS { + service = "k8s", + severity = "warning" + } + ANNOTATIONS { + summary = "API server unreachable", + description = "An API server could not be scraped.", + } + +# Disable for non HA kubernetes setups. +ALERT K8SApiserverDown + IF absent({job="kubernetes"}) or count by(cluster) (up{job="kubernetes"} == 1) < 2 + FOR 5m + LABELS { + service = "k8s", + severity = "critical" + } + ANNOTATIONS { + summary = "API server unreachable", + description = "Prometheus failed to scrape multiple API servers, or all API servers have disappeared from service discovery.", + } + +ALERT K8SSchedulerDown + IF absent(up{job="kube-scheduler"}) or (count by(cluster) (up{job="kube-scheduler"} == 1) == 0) + FOR 5m + LABELS { + service = "k8s", + severity = "critical", + } + ANNOTATIONS { + summary = "Scheduler is down", + description = "There is no running K8S scheduler. New pods are not being assigned to nodes.", + } + +ALERT K8SControllerManagerDown + IF absent(up{job="kube-controller-manager"}) or (count by(cluster) (up{job="kube-controller-manager"} == 1) == 0) + FOR 5m + LABELS { + service = "k8s", + severity = "critical", + } + ANNOTATIONS { + summary = "Controller manager is down", + description = "There is no running K8S controller manager. Deployments and replication controllers are not making progress.", + } + +ALERT K8SMoreThanOneController + IF count by (job,cluster) (up{job=~"kube-scheduler|kube-controller-manager"}) > 1 + FOR 5m + LABELS { + service = "k8s", + severity = "critical", + } + ANNOTATIONS { + summary = "More than one controller node is active", + description = "There is more than one {{ $labels.job }} managing the cluster. Cluster behaviour is undefined.", + } + +ALERT K8SConntrackTableFull + IF 100*node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 50 + FOR 10m + LABELS { + service = "k8s", + severity = "warning" + } + ANNOTATIONS { + summary = "Number of tracked connections is near the limit", + description = "The nf_conntrack table is {{ $value }}% full.", + } + +ALERT K8SConntrackTableFull + IF 100*node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 90 + LABELS { + service = "k8s", + severity = "critical" + } + ANNOTATIONS { + summary = "Number of tracked connections is near the limit", + description = "The nf_conntrack table is {{ $value }}% full.", + } + +# To catch the conntrack sysctl de-tuning when it happens +ALERT K8SConntrackTuningMissing + IF node_nf_conntrack_udp_timeout > 10 + FOR 10m + LABELS { + service = "k8s", + severity = "warning", + } + ANNOTATIONS { + summary = "Node does not have the correct conntrack tunings", + description = "Nodes keep un-setting the correct tunings, investigate when it happens.", + } + +ALERT K8STooManyOpenFiles + IF 100*process_open_fds{job=~"kubelets|kubernetes"} / process_max_fds > 50 + FOR 10m + LABELS { + service = "k8s", + severity = "warning" + } + ANNOTATIONS { + summary = "{{ $labels.job }} has too many open file descriptors", + description = "{{ $labels.node }} is using {{ $value }}% of the available file/socket descriptors.", + } + +ALERT K8STooManyOpenFiles + IF 100*process_open_fds{job=~"kubelets|kubernetes"} / process_max_fds > 80 + FOR 10m + LABELS { + service = "k8s", + severity = "critical" + } + ANNOTATIONS { + summary = "{{ $labels.job }} has too many open file descriptors", + description = "{{ $labels.node }} is using {{ $value }}% of the available file/socket descriptors.", + } + +# Some verbs excluded because they are expected to be long-lasting: +# WATCHLIST is long-poll, CONNECT is `kubectl exec`. +ALERT K8SApiServerLatency + IF histogram_quantile( + 0.99, + sum without (instance,node,resource) (apiserver_request_latencies_bucket{verb!~"CONNECT|WATCHLIST"}) + ) / 1e6 > 1.0 + FOR 10m + LABELS { + service = "k8s", + severity = "warning" + } + ANNOTATIONS { + summary = "Kubernetes apiserver latency is high", + description = "99th percentile Latency for {{ $labels.verb }} requests to the kube-apiserver is higher than 1s.", + } + +ALERT K8SApiServerEtcdAccessLatency + IF etcd_request_latencies_summary{quantile="0.99"} / 1e6 > 1.0 + FOR 15m + LABELS { + service = "k8s", + severity = "warning" + } + ANNOTATIONS { + summary = "Access to etcd is slow", + description = "99th percentile latency for apiserver to access etcd is higher than 1s.", + } + +ALERT K8SKubeletTooManyPods + IF kubelet_running_pod_count > 100 + LABELS { + service = "k8s", + severity = "warning", + } + ANNOTATIONS { + summary = "Kubelet is close to pod limit", + description = "Kubelet {{$labels.instance}} is running {{$value}} pods, close to the limit of 110", + } + diff --git a/hack/scripts/generate-configmaps.sh b/hack/scripts/generate-configmaps.sh index 925c17e8f9..a178878cfd 100755 --- a/hack/scripts/generate-configmaps.sh +++ b/hack/scripts/generate-configmaps.sh @@ -1,7 +1,7 @@ #!/bin/bash # Generate Alert Rules ConfigMap -kubectl create configmap --dry-run=true prometheus-k8s-alerts --from-file=assets/alerts/ -oyaml > manifests/prometheus/prometheus-k8s-alerts.yaml +kubectl create configmap --dry-run=true prometheus-k8s-rules --from-file=assets/alerts/ -oyaml > manifests/prometheus/prometheus-k8s-rules.yaml # Generate Dashboard ConfigMap kubectl create configmap --dry-run=true grafana-dashboards --from-file=assets/grafana/ -oyaml > manifests/grafana/grafana-cm.yaml diff --git a/manifests/prometheus/prometheus-k8s-alerts.yaml b/manifests/prometheus/prometheus-k8s-alerts.yaml deleted file mode 100644 index 7786d71462..0000000000 --- a/manifests/prometheus/prometheus-k8s-alerts.yaml +++ /dev/null @@ -1,7 +0,0 @@ -apiVersion: v1 -data: - .gitkeep: "" -kind: ConfigMap -metadata: - creationTimestamp: null - name: prometheus-k8s-alerts diff --git a/manifests/prometheus/prometheus-k8s-rules.yaml b/manifests/prometheus/prometheus-k8s-rules.yaml new file mode 100644 index 0000000000..a06550cfae --- /dev/null +++ b/manifests/prometheus/prometheus-k8s-rules.yaml @@ -0,0 +1,406 @@ +apiVersion: v1 +data: + .gitkeep: "" + kubernetes.rules: |+ + ### Container resources ### + + cluster_namespace_controller_pod_container:spec_memory_limit_bytes = + sum by (cluster,namespace,controller,pod_name,container_name) ( + label_replace( + container_spec_memory_limit_bytes{container_name!=""}, + "controller", "$1", + "pod_name", "^(.*)-[a-z0-9]+" + ) + ) + + cluster_namespace_controller_pod_container:spec_cpu_shares = + sum by (cluster,namespace,controller,pod_name,container_name) ( + label_replace( + container_spec_cpu_shares{container_name!=""}, + "controller", "$1", + "pod_name", "^(.*)-[a-z0-9]+" + ) + ) + + cluster_namespace_controller_pod_container:cpu_usage:rate = + sum by (cluster,namespace,controller,pod_name,container_name) ( + label_replace( + irate( + container_cpu_usage_seconds_total{container_name!=""}[5m] + ), + "controller", "$1", + "pod_name", "^(.*)-[a-z0-9]+" + ) + ) + + cluster_namespace_controller_pod_container:memory_usage:bytes = + sum by (cluster,namespace,controller,pod_name,container_name) ( + label_replace( + container_memory_usage_bytes{container_name!=""}, + "controller", "$1", + "pod_name", "^(.*)-[a-z0-9]+" + ) + ) + + cluster_namespace_controller_pod_container:memory_working_set:bytes = + sum by (cluster,namespace,controller,pod_name,container_name) ( + label_replace( + container_memory_working_set_bytes{container_name!=""}, + "controller", "$1", + "pod_name", "^(.*)-[a-z0-9]+" + ) + ) + + cluster_namespace_controller_pod_container:memory_rss:bytes = + sum by (cluster,namespace,controller,pod_name,container_name) ( + label_replace( + container_memory_rss{container_name!=""}, + "controller", "$1", + "pod_name", "^(.*)-[a-z0-9]+" + ) + ) + + cluster_namespace_controller_pod_container:memory_cache:bytes = + sum by (cluster,namespace,controller,pod_name,container_name) ( + label_replace( + container_memory_cache{container_name!=""}, + "controller", "$1", + "pod_name", "^(.*)-[a-z0-9]+" + ) + ) + + cluster_namespace_controller_pod_container:disk_usage:bytes = + sum by (cluster,namespace,controller,pod_name,container_name) ( + label_replace( + container_disk_usage_bytes{container_name!=""}, + "controller", "$1", + "pod_name", "^(.*)-[a-z0-9]+" + ) + ) + + cluster_namespace_controller_pod_container:memory_pagefaults:rate = + sum by (cluster,namespace,controller,pod_name,container_name,scope,type) ( + label_replace( + irate( + container_memory_failures_total{container_name!=""}[5m] + ), + "controller", "$1", + "pod_name", "^(.*)-[a-z0-9]+" + ) + ) + + cluster_namespace_controller_pod_container:memory_oom:rate = + sum by (cluster,namespace,controller,pod_name,container_name,scope,type) ( + label_replace( + irate( + container_memory_failcnt{container_name!=""}[5m] + ), + "controller", "$1", + "pod_name", "^(.*)-[a-z0-9]+" + ) + ) + + ### Cluster resources ### + + cluster:memory_allocation:percent = + 100 * sum by (cluster) ( + container_spec_memory_limit_bytes{pod_name!=""} + ) / sum by (cluster) ( + machine_memory_bytes + ) + + cluster:memory_used:percent = + 100 * sum by (cluster) ( + container_memory_usage_bytes{pod_name!=""} + ) / sum by (cluster) ( + machine_memory_bytes + ) + + cluster:cpu_allocation:percent = + 100 * sum by (cluster) ( + container_spec_cpu_shares{pod_name!=""} + ) / sum by (cluster) ( + container_spec_cpu_shares{id="/"} * on(cluster,instance) machine_cpu_cores + ) + + cluster:node_cpu_use:percent = + 100 * sum by (cluster) ( + rate(node_cpu{mode!="idle"}[5m]) + ) / sum by (cluster) ( + machine_cpu_cores + ) + + ### API latency ### + + # Raw metrics are in microseconds. Convert to seconds. + cluster_resource_verb:apiserver_latency:quantile_seconds{quantile="0.99"} = + histogram_quantile( + 0.99, + sum by(le,cluster,job,resource,verb) (apiserver_request_latencies_bucket) + ) / 1e6 + cluster_resource_verb:apiserver_latency:quantile_seconds{quantile="0.9"} = + histogram_quantile( + 0.9, + sum by(le,cluster,job,resource,verb) (apiserver_request_latencies_bucket) + ) / 1e6 + cluster_resource_verb:apiserver_latency:quantile_seconds{quantile="0.5"} = + histogram_quantile( + 0.5, + sum by(le,cluster,job,resource,verb) (apiserver_request_latencies_bucket) + ) / 1e6 + + ### Scheduling latency ### + + cluster:scheduler_e2e_scheduling_latency:quantile_seconds{quantile="0.99"} = + histogram_quantile(0.99,sum by (le,cluster) (scheduler_e2e_scheduling_latency_microseconds_bucket)) / 1e6 + cluster:scheduler_e2e_scheduling_latency:quantile_seconds{quantile="0.9"} = + histogram_quantile(0.9,sum by (le,cluster) (scheduler_e2e_scheduling_latency_microseconds_bucket)) / 1e6 + cluster:scheduler_e2e_scheduling_latency:quantile_seconds{quantile="0.5"} = + histogram_quantile(0.5,sum by (le,cluster) (scheduler_e2e_scheduling_latency_microseconds_bucket)) / 1e6 + + cluster:scheduler_scheduling_algorithm_latency:quantile_seconds{quantile="0.99"} = + histogram_quantile(0.99,sum by (le,cluster) (scheduler_scheduling_algorithm_latency_microseconds_bucket)) / 1e6 + cluster:scheduler_scheduling_algorithm_latency:quantile_seconds{quantile="0.9"} = + histogram_quantile(0.9,sum by (le,cluster) (scheduler_scheduling_algorithm_latency_microseconds_bucket)) / 1e6 + cluster:scheduler_scheduling_algorithm_latency:quantile_seconds{quantile="0.5"} = + histogram_quantile(0.5,sum by (le,cluster) (scheduler_scheduling_algorithm_latency_microseconds_bucket)) / 1e6 + + cluster:scheduler_binding_latency:quantile_seconds{quantile="0.99"} = + histogram_quantile(0.99,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6 + cluster:scheduler_binding_latency:quantile_seconds{quantile="0.9"} = + histogram_quantile(0.9,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6 + cluster:scheduler_binding_latency:quantile_seconds{quantile="0.5"} = + histogram_quantile(0.5,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6 + + ALERT K8SNodeDown + IF up{job="kubelets"} == 0 + FOR 1h + LABELS { + service = "k8s", + severity = "warning" + } + ANNOTATIONS { + summary = "Kubelet cannot be scraped", + description = "Prometheus could not scrape a {{ $labels.job }} for more than one hour", + } + + ALERT K8SNodeNotReady + IF kube_node_status_ready{condition="true"} == 0 + FOR 1h + LABELS { + service = "k8s", + severity = "warning", + } + ANNOTATIONS { + summary = "Node status is NotReady", + description = "The Kubelet on {{ $labels.node }} has not checked in with the API, or has set itself to NotReady, for more than an hour", + } + + ALERT K8SManyNodesNotReady + IF + count by (cluster) (kube_node_status_ready{condition="true"} == 0) > 1 + AND + ( + count by (cluster) (kube_node_status_ready{condition="true"} == 0) + / + count by (cluster) (kube_node_status_ready{condition="true"}) + ) > 0.2 + FOR 1m + LABELS { + service = "k8s", + severity = "critical", + } + ANNOTATIONS { + summary = "Many K8s nodes are Not Ready", + description = "{{ $value }} K8s nodes (more than 10% of cluster {{ $labels.cluster }}) are in the NotReady state.", + } + + ALERT K8SKubeletNodeExporterDown + IF up{job="node-exporter"} == 0 + FOR 15m + LABELS { + service = "k8s", + severity = "warning" + } + ANNOTATIONS { + summary = "Kubelet node_exporter cannot be scraped", + description = "Prometheus could not scrape a {{ $labels.job }} for more than one hour.", + } + + ALERT K8SKubeletDown + IF absent(up{job="kubelets"}) or count by (cluster) (up{job="kubelets"} == 0) / count by (cluster) (up{job="kubelets"}) > 0.1 + FOR 1h + LABELS { + service = "k8s", + severity = "critical" + } + ANNOTATIONS { + summary = "Many Kubelets cannot be scraped", + description = "Prometheus failed to scrape more than 10% of kubelets, or all Kubelets have disappeared from service discovery.", + } + + ALERT K8SApiserverDown + IF up{job="kubernetes"} == 0 + FOR 15m + LABELS { + service = "k8s", + severity = "warning" + } + ANNOTATIONS { + summary = "API server unreachable", + description = "An API server could not be scraped.", + } + + # Disable for non HA kubernetes setups. + ALERT K8SApiserverDown + IF absent({job="kubernetes"}) or count by(cluster) (up{job="kubernetes"} == 1) < 2 + FOR 5m + LABELS { + service = "k8s", + severity = "critical" + } + ANNOTATIONS { + summary = "API server unreachable", + description = "Prometheus failed to scrape multiple API servers, or all API servers have disappeared from service discovery.", + } + + ALERT K8SSchedulerDown + IF absent(up{job="kube-scheduler"}) or (count by(cluster) (up{job="kube-scheduler"} == 1) == 0) + FOR 5m + LABELS { + service = "k8s", + severity = "critical", + } + ANNOTATIONS { + summary = "Scheduler is down", + description = "There is no running K8S scheduler. New pods are not being assigned to nodes.", + } + + ALERT K8SControllerManagerDown + IF absent(up{job="kube-controller-manager"}) or (count by(cluster) (up{job="kube-controller-manager"} == 1) == 0) + FOR 5m + LABELS { + service = "k8s", + severity = "critical", + } + ANNOTATIONS { + summary = "Controller manager is down", + description = "There is no running K8S controller manager. Deployments and replication controllers are not making progress.", + } + + ALERT K8SMoreThanOneController + IF count by (job,cluster) (up{job=~"kube-scheduler|kube-controller-manager"}) > 1 + FOR 5m + LABELS { + service = "k8s", + severity = "critical", + } + ANNOTATIONS { + summary = "More than one controller node is active", + description = "There is more than one {{ $labels.job }} managing the cluster. Cluster behaviour is undefined.", + } + + ALERT K8SConntrackTableFull + IF 100*node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 50 + FOR 10m + LABELS { + service = "k8s", + severity = "warning" + } + ANNOTATIONS { + summary = "Number of tracked connections is near the limit", + description = "The nf_conntrack table is {{ $value }}% full.", + } + + ALERT K8SConntrackTableFull + IF 100*node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 90 + LABELS { + service = "k8s", + severity = "critical" + } + ANNOTATIONS { + summary = "Number of tracked connections is near the limit", + description = "The nf_conntrack table is {{ $value }}% full.", + } + + # To catch the conntrack sysctl de-tuning when it happens + ALERT K8SConntrackTuningMissing + IF node_nf_conntrack_udp_timeout > 10 + FOR 10m + LABELS { + service = "k8s", + severity = "warning", + } + ANNOTATIONS { + summary = "Node does not have the correct conntrack tunings", + description = "Nodes keep un-setting the correct tunings, investigate when it happens.", + } + + ALERT K8STooManyOpenFiles + IF 100*process_open_fds{job=~"kubelets|kubernetes"} / process_max_fds > 50 + FOR 10m + LABELS { + service = "k8s", + severity = "warning" + } + ANNOTATIONS { + summary = "{{ $labels.job }} has too many open file descriptors", + description = "{{ $labels.node }} is using {{ $value }}% of the available file/socket descriptors.", + } + + ALERT K8STooManyOpenFiles + IF 100*process_open_fds{job=~"kubelets|kubernetes"} / process_max_fds > 80 + FOR 10m + LABELS { + service = "k8s", + severity = "critical" + } + ANNOTATIONS { + summary = "{{ $labels.job }} has too many open file descriptors", + description = "{{ $labels.node }} is using {{ $value }}% of the available file/socket descriptors.", + } + + # Some verbs excluded because they are expected to be long-lasting: + # WATCHLIST is long-poll, CONNECT is `kubectl exec`. + ALERT K8SApiServerLatency + IF histogram_quantile( + 0.99, + sum without (instance,node,resource) (apiserver_request_latencies_bucket{verb!~"CONNECT|WATCHLIST"}) + ) / 1e6 > 1.0 + FOR 10m + LABELS { + service = "k8s", + severity = "warning" + } + ANNOTATIONS { + summary = "Kubernetes apiserver latency is high", + description = "99th percentile Latency for {{ $labels.verb }} requests to the kube-apiserver is higher than 1s.", + } + + ALERT K8SApiServerEtcdAccessLatency + IF etcd_request_latencies_summary{quantile="0.99"} / 1e6 > 1.0 + FOR 15m + LABELS { + service = "k8s", + severity = "warning" + } + ANNOTATIONS { + summary = "Access to etcd is slow", + description = "99th percentile latency for apiserver to access etcd is higher than 1s.", + } + + ALERT K8SKubeletTooManyPods + IF kubelet_running_pod_count > 100 + LABELS { + service = "k8s", + severity = "warning", + } + ANNOTATIONS { + summary = "Kubelet is close to pod limit", + description = "Kubelet {{$labels.instance}} is running {{$value}} pods, close to the limit of 110", + } + +kind: ConfigMap +metadata: + creationTimestamp: null + name: prometheus-k8s-rules