diff --git a/kubernetes_state/datadog_checks/kubernetes_state/kubernetes_state.py b/kubernetes_state/datadog_checks/kubernetes_state/kubernetes_state.py index 9a21ffb4e7032..2ffb387082dc7 100644 --- a/kubernetes_state/datadog_checks/kubernetes_state/kubernetes_state.py +++ b/kubernetes_state/datadog_checks/kubernetes_state/kubernetes_state.py @@ -4,7 +4,7 @@ import re import time -from collections import defaultdict +from collections import defaultdict, Counter try: # Agent5 compatibility layer @@ -342,16 +342,32 @@ def _trim_job_tag(self, name): # Labels attached: namespace, pod # As a message the phase=Pending|Running|Succeeded|Failed|Unknown # From the phase the check will update its status + # Also submits as an aggregated count with minimal tags so it is + # visualisable over time per namespace and phase def kube_pod_status_phase(self, message, **kwargs): """ Phase a pod is in. """ + metric_name = self.NAMESPACE + '.pod.status_phase' # Will submit a service check which status is given by its phase. # More details about the phase in the message of the check. 
check_basename = self.NAMESPACE + '.pod.phase' + status_phase_counter = Counter() + for metric in message.metric: self._condition_to_tag_check(metric, check_basename, self.pod_phase_to_status, tags=[self._label_to_tag("pod", metric.label), self._label_to_tag("namespace", metric.label)] + self.custom_tags) + # Counts aggregated cluster-wide to avoid no-data issues on pod churn, + # pod granularity available in the service checks + tags = [ + self._label_to_tag("namespace", metric.label), + self._label_to_tag("phase", metric.label) + ] + self.custom_tags + status_phase_counter[tuple(sorted(tags))] += metric.gauge.value + + for tags, count in status_phase_counter.items(): + self.gauge(metric_name, count, tags=list(tags)) + def kube_pod_container_status_waiting_reason(self, message, **kwargs): metric_name = self.NAMESPACE + '.container.status_report.count.waiting' for metric in message.metric: diff --git a/kubernetes_state/metadata.csv b/kubernetes_state/metadata.csv index b7a7be51c605e..d2cc9e34c89a0 100644 --- a/kubernetes_state/metadata.csv +++ b/kubernetes_state/metadata.csv @@ -52,6 +52,7 @@ kubernetes_state.hpa.target_cpu,gauge,,,,Target CPU percentage of pods managed b kubernetes_state.hpa.desired_replicas,gauge,,,,Desired number of replicas of pods managed by this autoscaler,0,kubernetes,k8s_state.hpa.desired_replicas kubernetes_state.pod.ready,gauge,,,,"In association with the `condition` tag, whether the pod is ready to serve requests, e.g. 
`condition:true` keeps the pods that are in a ready state",1,kubernetes,k8s_state.pod.ready kubernetes_state.pod.scheduled,gauge,,,,Reports the status of the scheduling process for the pod with its tags,0,kubernetes,k8s_state.pod.scheduled +kubernetes_state.pod.status_phase,gauge,,,,"To sum by `phase` to get number of pods in a given phase, and `namespace` to break this down by namespace",0,kubernetes,k8s_state.pod.status_phase kubernetes_state.replicaset.replicas,gauge,,,,The number of replicas per ReplicaSet,0,kubernetes,k8s_state.rs.replicas kubernetes_state.replicaset.fully_labeled_replicas,gauge,,,,The number of fully labeled replicas per ReplicaSet,0,kubernetes,k8s_state.rs.fully_labeled kubernetes_state.replicaset.replicas_ready,gauge,,,,The number of ready replicas per ReplicaSet,0,kubernetes,k8s_state.rs.replicas_rdy diff --git a/kubernetes_state/tests/test_kubernetes_state.py b/kubernetes_state/tests/test_kubernetes_state.py index caeb0b3440c12..d47873e9cf563 100644 --- a/kubernetes_state/tests/test_kubernetes_state.py +++ b/kubernetes_state/tests/test_kubernetes_state.py @@ -46,6 +46,7 @@ # pods NAMESPACE + '.pod.ready', NAMESPACE + '.pod.scheduled', + NAMESPACE + '.pod.status_phase', # containers NAMESPACE + '.container.ready', NAMESPACE + '.container.running', @@ -82,6 +83,12 @@ 'condition:OutOfDisk', 'condition:Ready', 'status:true', 'status:false', 'status:unknown', ], + NAMESPACE + '.pod.status_phase': [ + 'phase:Pending', 'phase:Running', + 'phase:Failed', 'phase:Succeeded', + 'phase:Unknown', 'namespace:default', + 'namespace:kube-system' + ], NAMESPACE + '.container.status_report.count.waiting': [ 'reason:CrashLoopBackoff', 'reason:ErrImagePull', @@ -217,6 +224,18 @@ def test_update_kube_state_metrics(aggregator, instance, check): aggregator.assert_metric(NAMESPACE + '.nodes.by_condition', tags=['condition:Ready', 'status:unknown', 'optional:tag1'], value=0) + # Make sure we send counts for all phases to avoid no-data graphing issues + 
aggregator.assert_metric(NAMESPACE + '.pod.status_phase', + tags=['namespace:default', 'phase:Pending', 'optional:tag1'], value=1) + aggregator.assert_metric(NAMESPACE + '.pod.status_phase', + tags=['namespace:default', 'phase:Running', 'optional:tag1'], value=3) + aggregator.assert_metric(NAMESPACE + '.pod.status_phase', + tags=['namespace:default', 'phase:Succeeded', 'optional:tag1'], value=2) + aggregator.assert_metric(NAMESPACE + '.pod.status_phase', + tags=['namespace:default', 'phase:Failed', 'optional:tag1'], value=2) + aggregator.assert_metric(NAMESPACE + '.pod.status_phase', + tags=['namespace:default', 'phase:Unknown', 'optional:tag1'], value=1) + for metric in METRICS: aggregator.assert_metric(metric, hostname=HOSTNAMES.get(metric, None)) for tag in TAGS.get(metric, []):