From f215b58da242c7abbdccf15774b2517124ab9a19 Mon Sep 17 00:00:00 2001
From: charlyF
Date: Thu, 16 Nov 2017 11:59:05 +0100
Subject: [PATCH] updating metadata, adding reason whitelists

---
 kubernetes_state/check.py                     | 34 ++++++++++++++++---
 .../ci/fixtures/prometheus/prometheus.txt     |  9 ++++-
 kubernetes_state/metadata.csv                 |  2 ++
 3 files changed, 40 insertions(+), 5 deletions(-)

diff --git a/kubernetes_state/check.py b/kubernetes_state/check.py
index 6d025c547e5f8..727e732331e26 100644
--- a/kubernetes_state/check.py
+++ b/kubernetes_state/check.py
@@ -299,15 +299,41 @@ def kube_pod_status_phase(self, message, **kwargs):
 
     def kube_pod_container_status_waiting_reason(self, message, **kwargs):
         metric_name = self.NAMESPACE + '.container.status_report.count.waiting'
+        whitelisted_reasons = {"ErrImagePull"}  # only these waiting reasons are reported
         for metric in message.metric:
-            tags = [self._format_tag(label.name, label.value) for label in metric.label]
-            self.count(metric_name, metric.gauge.value, tags)
+            reason = True  # reset per metric so one rejected reason cannot mute later metrics
+            tags = []  # rebuilt per metric so tags never leak from one metric to the next
+            for label in metric.label:
+                if label.name == "reason":
+                    if label.value in whitelisted_reasons:
+                        tags.append(self._format_tag(label.name, label.value))
+                    else:
+                        reason = False
+                elif label.name == "container":
+                    tags.append(self._format_tag("kube_container_name", label.value))
+                elif label.name == "namespace":  # any other label is not turned into a tag
+                    tags.append(self._format_tag(label.name, label.value))
+            if reason:
+                self.count(metric_name, metric.gauge.value, tags)
 
     def kube_pod_container_status_terminated_reason(self, message, **kwargs):
         metric_name = self.NAMESPACE + '.container.status_report.count.terminated'
+        whitelisted_reasons = {"OOMKilled", "ContainerCannotRun", "Error"}  # only these are reported
         for metric in message.metric:
-            tags = [self._format_tag(label.name, label.value) for label in metric.label]
-            self.count(metric_name, metric.gauge.value, tags)
+            reason = True  # reset per metric so one rejected reason cannot mute later metrics
+            tags = []  # rebuilt per metric so tags never leak from one metric to the next
+            for label in metric.label:
+                if label.name == "reason":
+                    if label.value in whitelisted_reasons:
+                        tags.append(self._format_tag(label.name, label.value))
+                    else:
+                        reason = False
+                elif label.name == "container":
+                    tags.append(self._format_tag("kube_container_name", label.value))
+                elif label.name == "namespace":  # any other label is not turned into a tag
+                    tags.append(self._format_tag(label.name, label.value))
+            if reason:
+                self.count(metric_name, metric.gauge.value, tags)
 
     def kube_cronjob_next_schedule_time(self, message, **kwargs):
         """ Time until the next schedule """
diff --git a/kubernetes_state/ci/fixtures/prometheus/prometheus.txt b/kubernetes_state/ci/fixtures/prometheus/prometheus.txt
index 05d5bb4775729..dc75f692b417a 100644
--- a/kubernetes_state/ci/fixtures/prometheus/prometheus.txt
+++ b/kubernetes_state/ci/fixtures/prometheus/prometheus.txt
@@ -376,6 +376,13 @@ kube_pod_container_status_terminated{container="should-run-once",namespace="defa
 kube_pod_container_status_terminated{container="sidecar",namespace="kube-system",pod="kube-dns-1326421443-hj4hx"} 0
 kube_pod_container_status_terminated{container="task-pv-container",namespace="default",pod="task-pv-pod"} 0
 kube_pod_container_status_terminated{container="tiller",namespace="kube-system",pod="tiller-deploy-1651615695-dcphn"} 0
+kube_pod_container_status_terminated{container="container2",namespace="ns2",pod="pod2"} 1
+# HELP kube_pod_container_status_terminated_reason Describes the reason the container is currently in terminated state.
+# TYPE kube_pod_container_status_terminated_reason gauge
+kube_pod_container_status_terminated_reason{container="container2",namespace="ns2",pod="pod2",reason="Completed"} 0
+kube_pod_container_status_terminated_reason{container="container2",namespace="ns2",pod="pod2",reason="ContainerCannotRun"} 0
+kube_pod_container_status_terminated_reason{container="container2",namespace="ns2",pod="pod2",reason="Error"} 0
+kube_pod_container_status_terminated_reason{container="container2",namespace="ns2",pod="pod2",reason="OOMKilled"} 1
 # HELP kube_pod_container_status_waiting Describes whether the container is currently in waiting state.
 # TYPE kube_pod_container_status_waiting gauge
 kube_pod_container_status_waiting{container="dd-k8state",namespace="default",pod="jaundiced-numbat-dd-k8state-b6s77"} 0
@@ -427,7 +434,7 @@ kube_pod_container_status_waiting_reason{container="sidecar",namespace="kube-sys
 kube_pod_container_status_waiting_reason{container="task-pv-container",namespace="default",pod="task-pv-pod",reason="ContainerCreating"} 0
 kube_pod_container_status_waiting_reason{container="task-pv-container",namespace="default",pod="task-pv-pod",reason="ErrImagePull"} 0
 kube_pod_container_status_waiting_reason{container="tiller",namespace="kube-system",pod="tiller-deploy-1651615695-dcphn",reason="ContainerCreating"} 0
-kube_pod_container_status_waiting_reason{container="tiller",namespace="kube-system",pod="tiller-deploy-1651615695-dcphn",reason="ErrImagePull"} 0
+kube_pod_container_status_waiting_reason{container="tiller",namespace="kube-system",pod="tiller-deploy-1651615695-dcphn",reason="ErrImagePull"} 1
 # HELP kube_pod_created Unix creation timestamp
 # TYPE kube_pod_created gauge
 kube_pod_created{namespace="default",pod="failingtest-f585bbd4-2fsml"} 1.510059371e+09
diff --git a/kubernetes_state/metadata.csv b/kubernetes_state/metadata.csv
index 61cb9a2b4a89c..acebf68db841e 100644
--- a/kubernetes_state/metadata.csv
+++ b/kubernetes_state/metadata.csv
@@ -2,7 +2,9 @@ metric_name,metric_type,interval,unit_name,per_unit_name,description,orientation
 kubernetes_state.container.ready,gauge,,,,Whether the containers readiness check succeeded,0,kubernetes,k8s_state.container.rdy
 kubernetes_state.container.running,gauge,,,,Whether the container is currently in running state,0,kubernetes,k8s_state.container.running
 kubernetes_state.container.terminated,gauge,,,,Whether the container is currently in terminated state,0,kubernetes,k8s_state.container.term
+kubernetes_state.container.status_report.count.terminated,count,,,,Count of the containers currently reporting a terminated state with the reason as a tag,-1,kubernetes,k8s_state.container.status_report.count.term
 kubernetes_state.container.waiting,gauge,,,,Whether the container is currently in waiting state,0,kubernetes,k8s_state.container.wait
+kubernetes_state.container.status_report.count.waiting,count,,,,Count of the containers currently reporting a waiting state with the reason as a tag,-1,kubernetes,k8s_state.container.status_report.count.wait
 kubernetes_state.container.gpu.request,gauge,,,The number of requested gpu devices by a container,0,kubernetes,k8s_state.container.gpu.request
 kubernetes_state.container.gpu.limit,gauge,,,The limit on gpu devices to be used by a container,0,kubernetes,k8s_state.container.gpu.limit
 kubernetes_state.container.restarts,gauge,,,,The number of restarts per container,-1,kubernetes,k8s_state.container.restarts