Merge pull request #2 from brancz/k8s-components
monitor and alert on k8s components
Showing 8 changed files with 864 additions and 8 deletions.
@@ -0,0 +1,398 @@
### Container resources ###

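# The container_* series below are the cAdvisor metrics exposed by the kubelet.
# Each recording rule uses label_replace to derive a "controller" label from
# pod_name by stripping the final "-<suffix>" segment, so that, for example, a
# (hypothetical) pod "frontend-x9z2q" is attributed to the controller
# "frontend". Rule names follow the level:metric:operation naming convention.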
cluster_namespace_controller_pod_container:spec_memory_limit_bytes =
  sum by (cluster,namespace,controller,pod_name,container_name) (
    label_replace(
      container_spec_memory_limit_bytes{container_name!=""},
      "controller", "$1",
      "pod_name", "^(.*)-[a-z0-9]+"
    )
  )

cluster_namespace_controller_pod_container:spec_cpu_shares =
  sum by (cluster,namespace,controller,pod_name,container_name) (
    label_replace(
      container_spec_cpu_shares{container_name!=""},
      "controller", "$1",
      "pod_name", "^(.*)-[a-z0-9]+"
    )
  )

cluster_namespace_controller_pod_container:cpu_usage:rate =
  sum by (cluster,namespace,controller,pod_name,container_name) (
    label_replace(
      irate(
        container_cpu_usage_seconds_total{container_name!=""}[5m]
      ),
      "controller", "$1",
      "pod_name", "^(.*)-[a-z0-9]+"
    )
  )

cluster_namespace_controller_pod_container:memory_usage:bytes =
  sum by (cluster,namespace,controller,pod_name,container_name) (
    label_replace(
      container_memory_usage_bytes{container_name!=""},
      "controller", "$1",
      "pod_name", "^(.*)-[a-z0-9]+"
    )
  )

cluster_namespace_controller_pod_container:memory_working_set:bytes =
  sum by (cluster,namespace,controller,pod_name,container_name) (
    label_replace(
      container_memory_working_set_bytes{container_name!=""},
      "controller", "$1",
      "pod_name", "^(.*)-[a-z0-9]+"
    )
  )

cluster_namespace_controller_pod_container:memory_rss:bytes =
  sum by (cluster,namespace,controller,pod_name,container_name) (
    label_replace(
      container_memory_rss{container_name!=""},
      "controller", "$1",
      "pod_name", "^(.*)-[a-z0-9]+"
    )
  )

cluster_namespace_controller_pod_container:memory_cache:bytes =
  sum by (cluster,namespace,controller,pod_name,container_name) (
    label_replace(
      container_memory_cache{container_name!=""},
      "controller", "$1",
      "pod_name", "^(.*)-[a-z0-9]+"
    )
  )

cluster_namespace_controller_pod_container:disk_usage:bytes =
  sum by (cluster,namespace,controller,pod_name,container_name) (
    label_replace(
      container_disk_usage_bytes{container_name!=""},
      "controller", "$1",
      "pod_name", "^(.*)-[a-z0-9]+"
    )
  )

cluster_namespace_controller_pod_container:memory_pagefaults:rate =
  sum by (cluster,namespace,controller,pod_name,container_name,scope,type) (
    label_replace(
      irate(
        container_memory_failures_total{container_name!=""}[5m]
      ),
      "controller", "$1",
      "pod_name", "^(.*)-[a-z0-9]+"
    )
  )

cluster_namespace_controller_pod_container:memory_oom:rate =
  sum by (cluster,namespace,controller,pod_name,container_name,scope,type) (
    label_replace(
      irate(
        container_memory_failcnt{container_name!=""}[5m]
      ),
      "controller", "$1",
      "pod_name", "^(.*)-[a-z0-9]+"
    )
  )
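
# Illustrative use of the recording rules above (namespace and controller
# values are hypothetical): memory usage of each container of a controller,
# as a percentage of its configured limit.
#
#   100
#     * cluster_namespace_controller_pod_container:memory_usage:bytes{namespace="prod", controller="frontend"}
#     / cluster_namespace_controller_pod_container:spec_memory_limit_bytes{namespace="prod", controller="frontend"}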

### Cluster resources ###

cluster:memory_allocation:percent =
  100 * sum by (cluster) (
    container_spec_memory_limit_bytes{pod_name!=""}
  ) / sum by (cluster) (
    machine_memory_bytes
  )

cluster:memory_used:percent =
  100 * sum by (cluster) (
    container_memory_usage_bytes{pod_name!=""}
  ) / sum by (cluster) (
    machine_memory_bytes
  )

cluster:cpu_allocation:percent =
  100 * sum by (cluster) (
    container_spec_cpu_shares{pod_name!=""}
  ) / sum by (cluster) (
    container_spec_cpu_shares{id="/"} * on(cluster,instance) machine_cpu_cores
  )
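
# In the denominator above, the root cgroup's cpu shares (id="/") are
# multiplied by the machine's core count, joined per instance, to estimate the
# total cpu shares allocatable on each node before summing per cluster.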

cluster:node_cpu_use:percent =
  100 * sum by (cluster) (
    rate(node_cpu{mode!="idle"}[5m])
  ) / sum by (cluster) (
    machine_cpu_cores
  )

### API latency ###

# Raw metrics are in microseconds. Convert to seconds.
cluster_resource_verb:apiserver_latency:quantile_seconds{quantile="0.99"} =
  histogram_quantile(
    0.99,
    sum by(le,cluster,job,resource,verb) (apiserver_request_latencies_bucket)
  ) / 1e6
cluster_resource_verb:apiserver_latency:quantile_seconds{quantile="0.9"} =
  histogram_quantile(
    0.9,
    sum by(le,cluster,job,resource,verb) (apiserver_request_latencies_bucket)
  ) / 1e6
cluster_resource_verb:apiserver_latency:quantile_seconds{quantile="0.5"} =
  histogram_quantile(
    0.5,
    sum by(le,cluster,job,resource,verb) (apiserver_request_latencies_bucket)
  ) / 1e6
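
# Summing the _bucket series by "le" (plus the labels to keep) before applying
# histogram_quantile aggregates the latency histograms across all apiserver
# instances. Example query against the recorded series (label values are
# illustrative):
#
#   cluster_resource_verb:apiserver_latency:quantile_seconds{verb="GET", quantile="0.99"}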

### Scheduling latency ###

cluster:scheduler_e2e_scheduling_latency:quantile_seconds{quantile="0.99"} =
  histogram_quantile(0.99,sum by (le,cluster) (scheduler_e2e_scheduling_latency_microseconds_bucket)) / 1e6
cluster:scheduler_e2e_scheduling_latency:quantile_seconds{quantile="0.9"} =
  histogram_quantile(0.9,sum by (le,cluster) (scheduler_e2e_scheduling_latency_microseconds_bucket)) / 1e6
cluster:scheduler_e2e_scheduling_latency:quantile_seconds{quantile="0.5"} =
  histogram_quantile(0.5,sum by (le,cluster) (scheduler_e2e_scheduling_latency_microseconds_bucket)) / 1e6

cluster:scheduler_scheduling_algorithm_latency:quantile_seconds{quantile="0.99"} =
  histogram_quantile(0.99,sum by (le,cluster) (scheduler_scheduling_algorithm_latency_microseconds_bucket)) / 1e6
cluster:scheduler_scheduling_algorithm_latency:quantile_seconds{quantile="0.9"} =
  histogram_quantile(0.9,sum by (le,cluster) (scheduler_scheduling_algorithm_latency_microseconds_bucket)) / 1e6
cluster:scheduler_scheduling_algorithm_latency:quantile_seconds{quantile="0.5"} =
  histogram_quantile(0.5,sum by (le,cluster) (scheduler_scheduling_algorithm_latency_microseconds_bucket)) / 1e6

cluster:scheduler_binding_latency:quantile_seconds{quantile="0.99"} =
  histogram_quantile(0.99,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6
cluster:scheduler_binding_latency:quantile_seconds{quantile="0.9"} =
  histogram_quantile(0.9,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6
cluster:scheduler_binding_latency:quantile_seconds{quantile="0.5"} =
  histogram_quantile(0.5,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6
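
# The alerting rules below use the Prometheus 1.x rule syntax: ALERT <name>,
# an IF expression, an optional FOR duration the condition must hold before
# the alert fires, plus static LABELS and templated ANNOTATIONS.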
ALERT K8SNodeDown
  IF up{job="kubelets"} == 0
  FOR 1h
  LABELS {
    service = "k8s",
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "Kubelet cannot be scraped",
    description = "Prometheus could not scrape a {{ $labels.job }} for more than one hour.",
  }

ALERT K8SNodeNotReady
  IF kube_node_status_ready{condition="true"} == 0
  FOR 1h
  LABELS {
    service = "k8s",
    severity = "warning",
  }
  ANNOTATIONS {
    summary = "Node status is NotReady",
    description = "The Kubelet on {{ $labels.node }} has not checked in with the API, or has set itself to NotReady, for more than an hour.",
  }

ALERT K8SManyNodesNotReady
  IF
    count by (cluster) (kube_node_status_ready{condition="true"} == 0) > 1
    AND
      (
        count by (cluster) (kube_node_status_ready{condition="true"} == 0)
      /
        count by (cluster) (kube_node_status_ready{condition="true"})
      ) > 0.2
  FOR 1m
  LABELS {
    service = "k8s",
    severity = "critical",
  }
  ANNOTATIONS {
    summary = "Many K8s nodes are Not Ready",
    description = "{{ $value }} K8s nodes (more than 20% of cluster {{ $labels.cluster }}) are in the NotReady state.",
  }

ALERT K8SKubeletNodeExporterDown
  IF up{job="node-exporter"} == 0
  FOR 15m
  LABELS {
    service = "k8s",
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "Kubelet node_exporter cannot be scraped",
    description = "Prometheus could not scrape a {{ $labels.job }} for more than 15 minutes.",
  }

ALERT K8SKubeletDown
  IF absent(up{job="kubelets"}) or count by (cluster) (up{job="kubelets"} == 0) / count by (cluster) (up{job="kubelets"}) > 0.1
  FOR 1h
  LABELS {
    service = "k8s",
    severity = "critical"
  }
  ANNOTATIONS {
    summary = "Many Kubelets cannot be scraped",
    description = "Prometheus failed to scrape more than 10% of kubelets, or all Kubelets have disappeared from service discovery.",
  }
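
# The absent()-or-ratio pattern above fires both when the job has vanished
# from service discovery entirely (absent(up{...})) and when more than the
# tolerated fraction of its targets fails scraping.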

ALERT K8SApiserverDown
  IF up{job="kubernetes"} == 0
  FOR 15m
  LABELS {
    service = "k8s",
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "API server unreachable",
    description = "An API server could not be scraped.",
  }

# Disable for non-HA Kubernetes setups.
ALERT K8SApiserverDown
  IF absent({job="kubernetes"}) or count by(cluster) (up{job="kubernetes"} == 1) < 2
  FOR 5m
  LABELS {
    service = "k8s",
    severity = "critical"
  }
  ANNOTATIONS {
    summary = "API server unreachable",
    description = "Prometheus failed to scrape multiple API servers, or all API servers have disappeared from service discovery.",
  }

ALERT K8SSchedulerDown
  IF absent(up{job="kube-scheduler"}) or (count by(cluster) (up{job="kube-scheduler"} == 1) == 0)
  FOR 5m
  LABELS {
    service = "k8s",
    severity = "critical",
  }
  ANNOTATIONS {
    summary = "Scheduler is down",
    description = "There is no running K8S scheduler. New pods are not being assigned to nodes.",
  }

ALERT K8SControllerManagerDown
  IF absent(up{job="kube-controller-manager"}) or (count by(cluster) (up{job="kube-controller-manager"} == 1) == 0)
  FOR 5m
  LABELS {
    service = "k8s",
    severity = "critical",
  }
  ANNOTATIONS {
    summary = "Controller manager is down",
    description = "There is no running K8S controller manager. Deployments and replication controllers are not making progress.",
  }

ALERT K8SMoreThanOneController
  IF count by (job,cluster) (up{job=~"kube-scheduler|kube-controller-manager"}) > 1
  FOR 5m
  LABELS {
    service = "k8s",
    severity = "critical",
  }
  ANNOTATIONS {
    summary = "More than one controller node is active",
    description = "There is more than one {{ $labels.job }} managing the cluster. Cluster behaviour is undefined.",
  }

ALERT K8SConntrackTableFull
  IF 100*node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 50
  FOR 10m
  LABELS {
    service = "k8s",
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "Number of tracked connections is near the limit",
    description = "The nf_conntrack table is {{ $value }}% full.",
  }

ALERT K8SConntrackTableFull
  IF 100*node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 90
  LABELS {
    service = "k8s",
    severity = "critical"
  }
  ANNOTATIONS {
    summary = "Number of tracked connections is near the limit",
    description = "The nf_conntrack table is {{ $value }}% full.",
  }
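
# Once the nf_conntrack table is full the kernel drops packets for new
# connections, which typically shows up as sporadic connection failures across
# the cluster; hence a warning at 50% and a critical alert at 90% utilisation.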

# Catch the conntrack sysctl de-tuning when it happens.
ALERT K8SConntrackTuningMissing
  IF node_nf_conntrack_udp_timeout > 10
  FOR 10m
  LABELS {
    service = "k8s",
    severity = "warning",
  }
  ANNOTATIONS {
    summary = "Node does not have the correct conntrack tunings",
    description = "Nodes keep un-setting the correct tunings; investigate when this happens.",
  }

ALERT K8STooManyOpenFiles
  IF 100*process_open_fds{job=~"kubelets|kubernetes"} / process_max_fds > 50
  FOR 10m
  LABELS {
    service = "k8s",
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "{{ $labels.job }} has too many open file descriptors",
    description = "{{ $labels.node }} is using {{ $value }}% of the available file/socket descriptors.",
  }

ALERT K8STooManyOpenFiles
  IF 100*process_open_fds{job=~"kubelets|kubernetes"} / process_max_fds > 80
  FOR 10m
  LABELS {
    service = "k8s",
    severity = "critical"
  }
  ANNOTATIONS {
    summary = "{{ $labels.job }} has too many open file descriptors",
    description = "{{ $labels.node }} is using {{ $value }}% of the available file/socket descriptors.",
  }

# Some verbs excluded because they are expected to be long-lasting:
# WATCHLIST is long-poll, CONNECT is `kubectl exec`.
ALERT K8SApiServerLatency
  IF histogram_quantile(
      0.99,
      sum without (instance,node,resource) (apiserver_request_latencies_bucket{verb!~"CONNECT|WATCHLIST"})
    ) / 1e6 > 1.0
  FOR 10m
  LABELS {
    service = "k8s",
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "Kubernetes apiserver latency is high",
    description = "99th percentile latency for {{ $labels.verb }} requests to the kube-apiserver is higher than 1s.",
  }

ALERT K8SApiServerEtcdAccessLatency
  IF etcd_request_latencies_summary{quantile="0.99"} / 1e6 > 1.0
  FOR 15m
  LABELS {
    service = "k8s",
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "Access to etcd is slow",
    description = "99th percentile latency for apiserver to access etcd is higher than 1s.",
  }

ALERT K8SKubeletTooManyPods
  IF kubelet_running_pod_count > 100
  LABELS {
    service = "k8s",
    severity = "warning",
  }
  ANNOTATIONS {
    summary = "Kubelet is close to pod limit",
    description = "Kubelet {{$labels.instance}} is running {{$value}} pods, close to the limit of 110.",
  }