diff --git a/Makefile b/Makefile
index a8fcdbcf10..b94ad85f5e 100644
--- a/Makefile
+++ b/Makefile
@@ -49,7 +49,7 @@ doccheck: generate
@echo "- Checking if the generated documentation is up to date..."
@git diff --exit-code
@echo "- Checking if the documentation is in sync with the code..."
- @grep -hoE '(kube_[^ |]+)' docs/* --exclude=README.md| sort -u > documented_metrics
+ @grep -hoE '(\| kube_[^ |]+)' docs/* --exclude=README.md| sed -E 's/\| //g' | sort -u > documented_metrics
@find internal/store -type f -not -name '*_test.go' -exec sed -nE 's/.*"(kube_[^"]+)"/\1/p' {} \; | sed -E 's/,//g' | sort -u > code_metrics
@diff -u0 code_metrics documented_metrics || (echo "ERROR: Metrics with - are present in code but missing in documentation, metrics with + are documented but not found in code."; exit 1)
@echo OK
diff --git a/docs/pod-metrics.md b/docs/pod-metrics.md
index feefb39d23..bc6eae3391 100644
--- a/docs/pod-metrics.md
+++ b/docs/pod-metrics.md
@@ -21,7 +21,8 @@
| kube_pod_container_status_restarts_total | Counter | `container`=<container-name>
`namespace`=<pod-namespace>
`pod`=<pod-name> | STABLE |
| kube_pod_container_resource_requests | Gauge | `resource`=<resource-name>
`unit`=<resource-unit>
`container`=<container-name>
`pod`=<pod-name>
`namespace`=<pod-namespace>
`node`=< node-name> | STABLE |
| kube_pod_container_resource_limits | Gauge | `resource`=<resource-name>
`unit`=<resource-unit>
`container`=<container-name>
`pod`=<pod-name>
`namespace`=<pod-namespace>
`node`=< node-name> | STABLE |
-| kube_pod_created | Gauge | `pod`=<pod-name>
`namespace`=<pod-namespace> |
+| kube_pod_created | Gauge | `pod`=<pod-name>
`namespace`=<pod-namespace> | STABLE |
+| kube_pod_deleted | Gauge | `pod`=<pod-name>
`namespace`=<pod-namespace> | EXPERIMENTAL |
| kube_pod_restart_policy | Gauge | `pod`=<pod-name>
`namespace`=<pod-namespace>
`type`=<Always|Never|OnFailure> | STABLE |
| kube_pod_init_container_info | Gauge | `container`=<container-name>
`pod`=<pod-name>
`namespace`=<pod-namespace>
`image`=<image-name>
`image_id`=<image-id>
`container_id`=<containerid> | STABLE |
| kube_pod_init_container_status_waiting | Gauge | `container`=<container-name>
`pod`=<pod-name>
`namespace`=<pod-namespace> | STABLE |
@@ -35,5 +36,35 @@
| kube_pod_init_container_resource_limits | Gauge | `resource`=<resource-name>
`unit`=<resource-unit>
`container`=<container-name>
`pod`=<pod-name>
`namespace`=<pod-namespace>
`node`=< node-name> | STABLE |
| kube_pod_spec_volumes_persistentvolumeclaims_info | Gauge | `pod`=<pod-name>
`namespace`=<pod-namespace>
`volume`=<volume-name>
`persistentvolumeclaim`=<persistentvolumeclaim-claimname> | STABLE |
| kube_pod_spec_volumes_persistentvolumeclaims_readonly | Gauge | `pod`=<pod-name>
`namespace`=<pod-namespace>
`volume`=<volume-name>
`persistentvolumeclaim`=<persistentvolumeclaim-claimname> | STABLE |
+| kube_pod_status_reason | Gauge | `pod`=<pod-name>
`namespace`=<pod-namespace>
`reason`=<NodeLost\|Evicted> | EXPERIMENTAL |
| kube_pod_status_scheduled_time | Gauge | `pod`=<pod-name>
`namespace`=<pod-namespace> | STABLE |
| kube_pod_status_unschedulable | Gauge | `pod`=<pod-name>
`namespace`=<pod-namespace> | STABLE |
+
+## Useful metrics queries
+
+### How to retrieve non-standard Pod states
+
+It is not straightforward to get the Pod state for certain cases like "Terminating" and "Unknown", since these states are not stored in a field of `Pod.Status`.
+
+To get them, you need to combine multiple metrics (as is done in the `kubectl` command-line code).
+
+For example:
+
+* To get the list of pods that are in the `Unknown` state, you can run the following promQL query: `count(kube_pod_status_phase{phase="Running"} == 1) by (namespace, pod) * count(kube_pod_status_reason{reason="NodeLost"} == 1) by (namespace, pod)`
+
+* For Pods in the `Terminating` state: `count(kube_pod_status_phase{phase="Running"} == 1) by (namespace, pod) * count(kube_pod_deleted) by (namespace, pod) * count(kube_pod_status_reason{reason="NodeLost"} == 0) by (namespace, pod)`
+
+Here is an example of a Prometheus rule that can be used to alert on a Pod that has been in the `Terminating` state for more than `5m`.
+
+```yaml
+groups:
+- name: Pod state
+ rules:
+ - alert: PodsBlockedInTerminatingState
+ expr: count(kube_pod_status_phase{phase="Running"} == 1) by (namespace, pod) * count(kube_pod_deleted) by (namespace, pod) * count(kube_pod_status_reason{reason="NodeLost"} == 0) by (namespace, pod) > 0
+ for: 5m
+ labels:
+ severity: page
+ annotations:
+ summary: Pod {{ $labels.namespace }}/{{ $labels.pod }} is blocked in Terminating state.
+```
diff --git a/internal/store/pod.go b/internal/store/pod.go
index 27ff669de7..ac2af0e7da 100644
--- a/internal/store/pod.go
+++ b/internal/store/pod.go
@@ -37,6 +37,7 @@ var (
descPodLabelsDefaultLabels = []string{"namespace", "pod"}
containerWaitingReasons = []string{"ContainerCreating", "CrashLoopBackOff", "CreateContainerConfigError", "ErrImagePull", "ImagePullBackOff", "CreateContainerError", "InvalidImageName"}
containerTerminatedReasons = []string{"OOMKilled", "Completed", "Error", "ContainerCannotRun", "DeadlineExceeded", "Evicted"}
+ podStatusReasons = []string{"NodeLost", "Evicted"}
podMetricFamilies = []generator.FamilyGenerator{
{
@@ -197,6 +198,26 @@ var (
}
}),
},
+ {
+ Name: "kube_pod_deleted",
+ Type: metric.Gauge,
+ Help: "Unix deletion timestamp",
+ GenerateFunc: wrapPodFunc(func(p *v1.Pod) *metric.Family {
+ ms := []*metric.Metric{}
+ // No sample is emitted unless the pod carries a non-nil, non-zero DeletionTimestamp.
+ if p.DeletionTimestamp != nil && !p.DeletionTimestamp.IsZero() {
+ ms = append(ms, &metric.Metric{
+ LabelKeys: []string{},
+ LabelValues: []string{},
+ Value: float64(p.DeletionTimestamp.Unix()),
+ })
+ }
+ // NOTE(review): namespace/pod labels are presumably added by wrapPodFunc — confirm.
+ return &metric.Family{
+ Metrics: ms,
+ }
+ }),
+ },
{
Name: "kube_pod_restart_policy",
Type: metric.Gauge,
@@ -354,6 +375,30 @@ var (
}
}),
},
+ {
+ Name: "kube_pod_status_reason",
+ Type: metric.Gauge,
+ Help: "The pod status reasons",
+ GenerateFunc: wrapPodFunc(func(p *v1.Pod) *metric.Family {
+ // Emit one 0/1 sample per entry of podStatusReasons: 1 when it equals
+ // p.Status.Reason, 0 otherwise (an unlisted reason yields all zeros).
+ ms := make([]*metric.Metric, 0, len(podStatusReasons))
+
+ for _, reason := range podStatusReasons {
+ // Build the sample inline: a local variable named "metric" would
+ // shadow the imported metric package inside the loop body.
+ ms = append(ms, &metric.Metric{
+ LabelKeys: []string{"reason"},
+ LabelValues: []string{reason},
+ Value: boolFloat64(p.Status.Reason == reason),
+ })
+ }
+
+ return &metric.Family{
+ Metrics: ms,
+ }
+ }),
+ },
{
Name: "kube_pod_container_info",
Type: metric.Gauge,
diff --git a/internal/store/pod_test.go b/internal/store/pod_test.go
index 122de3c237..e856cb6ce2 100644
--- a/internal/store/pod_test.go
+++ b/internal/store/pod_test.go
@@ -863,6 +863,32 @@ kube_pod_container_status_last_terminated_reason{container="container7",namespac
`,
MetricNames: []string{"kube_pod_created", "kube_pod_info", "kube_pod_start_time", "kube_pod_completion_time", "kube_pod_owner"},
},
+ {
+ Obj: &v1.Pod{
+ ObjectMeta: metav1.ObjectMeta{
+ Name: "pod1",
+ CreationTimestamp: metav1.Time{Time: time.Unix(1500000000, 0)},
+ Namespace: "ns1",
+ UID: "abc-123-xxx",
+ DeletionTimestamp: &metav1.Time{Time: time.Unix(1800000000, 0)},
+ },
+ Spec: v1.PodSpec{
+ NodeName: "node1",
+ PriorityClassName: "system-node-critical",
+ },
+ Status: v1.PodStatus{
+ HostIP: "1.1.1.1",
+ PodIP: "1.2.3.4",
+ StartTime: &metav1StartTime,
+ },
+ },
+ Want: `
+ # HELP kube_pod_deleted Unix deletion timestamp
+ # TYPE kube_pod_deleted gauge
+ kube_pod_deleted{namespace="ns1",pod="pod1"} 1.8e+09
+`,
+ MetricNames: []string{"kube_pod_deleted"},
+ },
{
Obj: &v1.Pod{
ObjectMeta: metav1.ObjectMeta{
@@ -1055,14 +1081,58 @@ kube_pod_container_status_last_terminated_reason{container="container7",namespac
},
Want: `
# HELP kube_pod_status_phase The pods current phase.
+ # HELP kube_pod_status_reason The pod status reasons
# TYPE kube_pod_status_phase gauge
+ # TYPE kube_pod_status_reason gauge
kube_pod_status_phase{namespace="ns4",phase="Failed",pod="pod4"} 0
kube_pod_status_phase{namespace="ns4",phase="Pending",pod="pod4"} 0
kube_pod_status_phase{namespace="ns4",phase="Running",pod="pod4"} 0
kube_pod_status_phase{namespace="ns4",phase="Succeeded",pod="pod4"} 0
kube_pod_status_phase{namespace="ns4",phase="Unknown",pod="pod4"} 1
+ kube_pod_status_reason{namespace="ns4",pod="pod4",reason="Evicted"} 0
+ kube_pod_status_reason{namespace="ns4",pod="pod4",reason="NodeLost"} 1
`,
- MetricNames: []string{"kube_pod_status_phase"},
+ MetricNames: []string{"kube_pod_status_phase", "kube_pod_status_reason"},
+ },
+ {
+ Obj: &v1.Pod{
+ ObjectMeta: metav1.ObjectMeta{
+ Name: "pod4",
+ Namespace: "ns4",
+ DeletionTimestamp: &metav1.Time{},
+ },
+ Status: v1.PodStatus{
+ Phase: v1.PodRunning,
+ Reason: "Evicted",
+ },
+ },
+ Want: `
+ # HELP kube_pod_status_reason The pod status reasons
+ # TYPE kube_pod_status_reason gauge
+ kube_pod_status_reason{namespace="ns4",pod="pod4",reason="Evicted"} 1
+ kube_pod_status_reason{namespace="ns4",pod="pod4",reason="NodeLost"} 0
+`,
+ MetricNames: []string{"kube_pod_status_reason"},
+ },
+ {
+ Obj: &v1.Pod{
+ ObjectMeta: metav1.ObjectMeta{
+ Name: "pod4",
+ Namespace: "ns4",
+ DeletionTimestamp: &metav1.Time{},
+ },
+ Status: v1.PodStatus{
+ Phase: v1.PodRunning,
+ Reason: "other reason",
+ },
+ },
+ Want: `
+ # HELP kube_pod_status_reason The pod status reasons
+ # TYPE kube_pod_status_reason gauge
+ kube_pod_status_reason{namespace="ns4",pod="pod4",reason="Evicted"} 0
+ kube_pod_status_reason{namespace="ns4",pod="pod4",reason="NodeLost"} 0
+`,
+ MetricNames: []string{"kube_pod_status_reason"},
},
{
Obj: &v1.Pod{
@@ -1535,7 +1605,7 @@ func BenchmarkPodStore(b *testing.B) {
},
}
- expectedFamilies := 35
+ expectedFamilies := 37
for n := 0; n < b.N; n++ {
families := f(pod)
if len(families) != expectedFamilies {
diff --git a/main_test.go b/main_test.go
index a164f0feb0..023190445e 100644
--- a/main_test.go
+++ b/main_test.go
@@ -171,6 +171,8 @@ kube_pod_labels{namespace="default",pod="pod0"} 1
# HELP kube_pod_created Unix creation timestamp
# TYPE kube_pod_created gauge
kube_pod_created{namespace="default",pod="pod0"} 1.5e+09
+# HELP kube_pod_deleted Unix deletion timestamp
+# TYPE kube_pod_deleted gauge
# HELP kube_pod_restart_policy Describes the restart policy in use by this pod.
# TYPE kube_pod_restart_policy gauge
kube_pod_restart_policy{namespace="default",pod="pod0",type="Always"} 1
@@ -187,6 +189,10 @@ kube_pod_status_phase{namespace="default",pod="pod0",phase="Running"} 1
kube_pod_status_phase{namespace="default",pod="pod0",phase="Unknown"} 0
# HELP kube_pod_status_ready Describes whether the pod is ready to serve requests.
# TYPE kube_pod_status_ready gauge
+# HELP kube_pod_status_reason The pod status reasons
+# TYPE kube_pod_status_reason gauge
+kube_pod_status_reason{namespace="default",pod="pod0",reason="Evicted"} 0
+kube_pod_status_reason{namespace="default",pod="pod0",reason="NodeLost"} 0
# HELP kube_pod_status_scheduled Describes the status of the scheduling process for the pod.
# TYPE kube_pod_status_scheduled gauge
# HELP kube_pod_container_info Information about a container in a pod.