Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add sister gauge metrics to kubernetes_state pod service checks #1578

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

import re
import time
from collections import defaultdict
from collections import defaultdict, Counter

try:
# Agent5 compatibility layer
Expand Down Expand Up @@ -342,16 +342,32 @@ def _trim_job_tag(self, name):
# Labels attached: namespace, pod
# As a message the phase=Pending|Running|Succeeded|Failed|Unknown
# From the phase the check will update its status
# Also submits as an aggregated count with minimal tags so it is
# visualisable over time per namespace and phase
def kube_pod_status_phase(self, message, **kwargs):
    """ Phase a pod is in.

    For each pod metric in `message`, submits a service check
    (`<NAMESPACE>.pod.phase`) whose status is derived from the pod's
    phase via `self.pod_phase_to_status`, with per-pod tags.

    Also submits an aggregated gauge (`<NAMESPACE>.pod.status_phase`)
    counting pods per namespace/phase, so the series survives pod
    churn and stays graphable over time.
    """
    metric_name = self.NAMESPACE + '.pod.status_phase'
    # Will submit a service check which status is given by its phase.
    # More details about the phase in the message of the check.
    check_basename = self.NAMESPACE + '.pod.phase'
    status_phase_counter = Counter()

    for metric in message.metric:
        self._condition_to_tag_check(metric, check_basename, self.pod_phase_to_status,
                                     tags=[self._label_to_tag("pod", metric.label),
                                           self._label_to_tag("namespace", metric.label)] + self.custom_tags)

        # Counts aggregated cluster-wide to avoid no-data issues on pod churn,
        # pod granularity available in the service checks
        tags = [
            self._label_to_tag("namespace", metric.label),
            self._label_to_tag("phase", metric.label)
        ] + self.custom_tags
        # Sort the tags so that any permutation of the same tag set
        # maps to a single Counter key.
        status_phase_counter[tuple(sorted(tags))] += metric.gauge.value

    # BUGFIX: use .items() instead of the Python-2-only .iteritems();
    # .items() behaves correctly on both Python 2 and Python 3.
    for tags, count in status_phase_counter.items():
        self.gauge(metric_name, count, tags=list(tags))

def kube_pod_container_status_waiting_reason(self, message, **kwargs):
metric_name = self.NAMESPACE + '.container.status_report.count.waiting'
for metric in message.metric:
Expand Down
1 change: 1 addition & 0 deletions kubernetes_state/metadata.csv
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ kubernetes_state.hpa.target_cpu,gauge,,,,Target CPU percentage of pods managed b
kubernetes_state.hpa.desired_replicas,gauge,,,,Desired number of replicas of pods managed by this autoscaler,0,kubernetes,k8s_state.hpa.desired_replicas
kubernetes_state.pod.ready,gauge,,,,"In association with the `condition` tag, whether the pod is ready to serve requests, e.g. `condition:true` keeps the pods that are in a ready state",1,kubernetes,k8s_state.pod.ready
kubernetes_state.pod.scheduled,gauge,,,,Reports the status of the scheduling process for the pod with its tags,0,kubernetes,k8s_state.pod.scheduled
kubernetes_state.pod.status_phase,gauge,,,,"To sum by `phase` to get number of pods in a given phase, and `namespace` to break this down by namespace",0,kubernetes,k8s_state.pod.status_phase
kubernetes_state.replicaset.replicas,gauge,,,,The number of replicas per ReplicaSet,0,kubernetes,k8s_state.rs.replicas
kubernetes_state.replicaset.fully_labeled_replicas,gauge,,,,The number of fully labeled replicas per ReplicaSet,0,kubernetes,k8s_state.rs.fully_labeled
kubernetes_state.replicaset.replicas_ready,gauge,,,,The number of ready replicas per ReplicaSet,0,kubernetes,k8s_state.rs.replicas_rdy
Expand Down
19 changes: 19 additions & 0 deletions kubernetes_state/tests/test_kubernetes_state.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@
# pods
NAMESPACE + '.pod.ready',
NAMESPACE + '.pod.scheduled',
NAMESPACE + '.pod.status_phase',
# containers
NAMESPACE + '.container.ready',
NAMESPACE + '.container.running',
Expand Down Expand Up @@ -82,6 +83,12 @@
'condition:OutOfDisk', 'condition:Ready',
'status:true', 'status:false', 'status:unknown',
],
NAMESPACE + '.pod.status_phase': [
'phase:Pending', 'phase:Running',
'phase:Failed', 'phase:Succeeded',
'phase:Unknown', 'namespace:default',
'namespace:kube-system'
],
NAMESPACE + '.container.status_report.count.waiting': [
'reason:CrashLoopBackoff',
'reason:ErrImagePull',
Expand Down Expand Up @@ -217,6 +224,18 @@ def test_update_kube_state_metrics(aggregator, instance, check):
aggregator.assert_metric(NAMESPACE + '.nodes.by_condition',
tags=['condition:Ready', 'status:unknown', 'optional:tag1'], value=0)

# Make sure we send counts for all phases to avoid no-data graphing issues
aggregator.assert_metric(NAMESPACE + '.pod.status_phase',
tags=['namespace:default', 'phase:Pending', 'optional:tag1'], value=1)
aggregator.assert_metric(NAMESPACE + '.pod.status_phase',
tags=['namespace:default', 'phase:Running', 'optional:tag1'], value=3)
aggregator.assert_metric(NAMESPACE + '.pod.status_phase',
tags=['namespace:default', 'phase:Succeeded', 'optional:tag1'], value=2)
aggregator.assert_metric(NAMESPACE + '.pod.status_phase',
tags=['namespace:default', 'phase:Failed', 'optional:tag1'], value=2)
aggregator.assert_metric(NAMESPACE + '.pod.status_phase',
tags=['namespace:default', 'phase:Unknown', 'optional:tag1'], value=1)

for metric in METRICS:
aggregator.assert_metric(metric, hostname=HOSTNAMES.get(metric, None))
for tag in TAGS.get(metric, []):
Expand Down