Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add sister gauge metrics to kubernetes_state pod service checks #1578

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

import re
import time
from collections import defaultdict
from collections import defaultdict, Counter

try:
# Agent5 compatibility layer
Expand Down Expand Up @@ -342,16 +342,32 @@ def _trim_job_tag(self, name):
# Labels attached: namespace, pod
# As a message the phase=Pending|Running|Succeeded|Failed|Unknown
# From the phase the check will update its status
# Also submits as an aggregated count with minimal tags so it is
# visualisable over time per namespace and phase
def kube_pod_status_phase(self, message, **kwargs):
    """ Phase a pod is in.

    For each pod metric in `message`, submits a service check
    (`<NAMESPACE>.pod.phase`) whose status is derived from the pod's
    phase via `self.pod_phase_to_status`, with per-pod tags.

    Also submits an aggregated gauge (`<NAMESPACE>.pod.status_phase`)
    counting pods per namespace/phase, so the series survives pod
    churn and stays graphable over time.
    """
    metric_name = self.NAMESPACE + '.pod.status_phase'
    # Will submit a service check which status is given by its phase.
    # More details about the phase in the message of the check.
    check_basename = self.NAMESPACE + '.pod.phase'
    status_phase_counter = Counter()

    for metric in message.metric:
        self._condition_to_tag_check(metric, check_basename, self.pod_phase_to_status,
                                     tags=[self._label_to_tag("pod", metric.label),
                                           self._label_to_tag("namespace", metric.label)] + self.custom_tags)

        # Counts aggregated cluster-wide to avoid no-data issues on pod churn,
        # pod granularity available in the service checks
        tags = [
            self._label_to_tag("namespace", metric.label),
            self._label_to_tag("phase", metric.label)
        ] + self.custom_tags
        # Sort the tags so that any permutation of the same tag set
        # maps to a single Counter key.
        status_phase_counter[tuple(sorted(tags))] += metric.gauge.value

    # BUGFIX: use .items() instead of the Python-2-only .iteritems();
    # .items() behaves correctly on both Python 2 and Python 3.
    for tags, count in status_phase_counter.items():
        self.gauge(metric_name, count, tags=list(tags))

def kube_pod_container_status_waiting_reason(self, message, **kwargs):
metric_name = self.NAMESPACE + '.container.status_report.count.waiting'
for metric in message.metric:
Expand Down
1 change: 1 addition & 0 deletions kubernetes_state/metadata.csv
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ kubernetes_state.hpa.target_cpu,gauge,,,,Target CPU percentage of pods managed b
kubernetes_state.hpa.desired_replicas,gauge,,,,Desired number of replicas of pods managed by this autoscaler,0,kubernetes,k8s_state.hpa.desired_replicas
kubernetes_state.pod.ready,gauge,,,,"In association with the `condition` tag, whether the pod is ready to serve requests, e.g. `condition:true` keeps the pods that are in a ready state",1,kubernetes,k8s_state.pod.ready
kubernetes_state.pod.scheduled,gauge,,,,Reports the status of the scheduling process for the pod with its tags,0,kubernetes,k8s_state.pod.scheduled
kubernetes_state.pod.status_phase,gauge,,,,"To sum by `phase` to get number of pods in a given phase, and `namespace` to break this down by namespace",0,kubernetes,k8s_state.pod.status_phase
kubernetes_state.replicaset.replicas,gauge,,,,The number of replicas per ReplicaSet,0,kubernetes,k8s_state.rs.replicas
kubernetes_state.replicaset.fully_labeled_replicas,gauge,,,,The number of fully labeled replicas per ReplicaSet,0,kubernetes,k8s_state.rs.fully_labeled
kubernetes_state.replicaset.replicas_ready,gauge,,,,The number of ready replicas per ReplicaSet,0,kubernetes,k8s_state.rs.replicas_rdy
Expand Down
19 changes: 19 additions & 0 deletions kubernetes_state/tests/test_kubernetes_state.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@
# pods
NAMESPACE + '.pod.ready',
NAMESPACE + '.pod.scheduled',
NAMESPACE + '.pod.status_phase',
# containers
NAMESPACE + '.container.ready',
NAMESPACE + '.container.running',
Expand Down Expand Up @@ -82,6 +83,12 @@
'condition:OutOfDisk', 'condition:Ready',
'status:true', 'status:false', 'status:unknown',
],
NAMESPACE + '.pod.status_phase': [
'phase:Pending', 'phase:Running',
'phase:Failed', 'phase:Succeeded',
'phase:Unknown', 'namespace:default',
'namespace:kube-system'
],
NAMESPACE + '.container.status_report.count.waiting': [
'reason:CrashLoopBackoff',
'reason:ErrImagePull',
Expand Down Expand Up @@ -217,6 +224,18 @@ def test_update_kube_state_metrics(aggregator, instance, check):
aggregator.assert_metric(NAMESPACE + '.nodes.by_condition',
tags=['condition:Ready', 'status:unknown', 'optional:tag1'], value=0)

# Make sure we send counts for all phases to avoid no-data graphing issues
aggregator.assert_metric(NAMESPACE + '.pod.status_phase',
tags=['namespace:default', 'phase:Pending', 'optional:tag1'], value=1)
aggregator.assert_metric(NAMESPACE + '.pod.status_phase',
tags=['namespace:default', 'phase:Running', 'optional:tag1'], value=3)
aggregator.assert_metric(NAMESPACE + '.pod.status_phase',
tags=['namespace:default', 'phase:Succeeded', 'optional:tag1'], value=2)
aggregator.assert_metric(NAMESPACE + '.pod.status_phase',
tags=['namespace:default', 'phase:Failed', 'optional:tag1'], value=2)
aggregator.assert_metric(NAMESPACE + '.pod.status_phase',
tags=['namespace:default', 'phase:Unknown', 'optional:tag1'], value=1)

for metric in METRICS:
aggregator.assert_metric(metric, hostname=HOSTNAMES.get(metric, None))
for tag in TAGS.get(metric, []):
Expand Down