Commit

rebasing on [#801] and updating prometheus.txt
charlyF committed Nov 14, 2017
1 parent da26a02 commit e158e97
Showing 7 changed files with 123 additions and 15 deletions.
3 changes: 2 additions & 1 deletion kubernetes_state/CHANGELOG.md
@@ -2,9 +2,10 @@

1.4.0 / Unreleased
==================

### Changes

* [UPDATE] Update auto_conf template to support Agent 6 and 5.20+. See [#860][]
* [FEATURE] Add HPA metrics. See [#801][]
* [FEATURE] Add metrics for GPU, PVC, CronJobs and others added in kubernetes_state 1.1.0. See [#853][]

1.3.0 / 2017-08-28
2 changes: 1 addition & 1 deletion kubernetes_state/auto_conf.yaml
@@ -1,4 +1,4 @@
docker_images:
ad_identifiers:
- kube-state-metrics

init_config:
52 changes: 51 additions & 1 deletion kubernetes_state/check.py
@@ -54,6 +54,10 @@ def __init__(self, name, init_config, agentConfig, instances=None):
'kube_deployment_status_replicas_available': 'deployment.replicas_available',
'kube_deployment_status_replicas_unavailable': 'deployment.replicas_unavailable',
'kube_deployment_status_replicas_updated': 'deployment.replicas_updated',
'kube_hpa_spec_min_replicas': 'hpa.min_replicas',
'kube_hpa_spec_max_replicas': 'hpa.max_replicas',
'kube_hpa_status_desired_replicas': 'hpa.desired_replicas',
'kube_hpa_status_current_replicas': 'hpa.current_replicas',
'kube_node_status_allocatable_cpu_cores': 'node.cpu_allocatable',
'kube_node_status_allocatable_memory_bytes': 'node.memory_allocatable',
'kube_node_status_allocatable_pods': 'node.pods_allocatable',
@@ -128,6 +132,7 @@ def __init__(self, name, init_config, agentConfig, instances=None):
'kube_replicationcontroller_status_observed_generation',
'kube_statefulset_metadata_generation',
'kube_statefulset_status_observed_generation',
'kube_hpa_metadata_generation',
# kube_node_status_phase and kube_namespace_status_phase have no use case as a service check
'kube_namespace_status_phase',
'kube_node_status_phase',
@@ -200,6 +205,44 @@ def _condition_to_service_check(self, metric, sc_name, mapping, tags=None):
else:
self.log.debug("Unable to handle %s - unknown condition %s" % (sc_name, label.value))

def _condition_to_tag_check(self, metric, base_sc_name, mapping, tags=None):
"""
Metrics from kube-state-metrics have changed
For example:
kube_node_status_condition{condition="Ready",node="ip-172-33-39-189.eu-west-1.compute.internal",status="true"} 1
kube_node_status_condition{condition="OutOfDisk",node="ip-172-33-57-130.eu-west-1.compute.internal",status="false"} 1
metric {
label { name: "condition", value: "true"
}
# other labels here
gauge { value: 1.0 }
}
This function evaluates metrics containing conditions and sends a service check
based on a provided condition->check mapping dict
"""
if bool(metric.gauge.value) is False:
return # Ignore if gauge is not 1
label_value, condition_map = self._get_metric_condition_map(base_sc_name, metric.label)
service_check_name = condition_map['service_check_name']
mapping = condition_map['mapping']
if condition_map['service_check_name'] is None:
self.log.debug("Unable to handle %s - unknown condition %s" % (service_check_name, label_value))
else:
self.service_check(service_check_name, mapping[label_value], tags=tags)
self.log.debug("%s %s %s" % (service_check_name, mapping[label_value], tags))

def _get_metric_condition_map(self, base_sc_name, labels):
switch = {
'Ready': {'service_check_name': base_sc_name + '.ready', 'mapping': self.condition_to_status_positive},
'OutOfDisk': {'service_check_name': base_sc_name + '.out_of_disk', 'mapping': self.condition_to_status_negative},
'DiskPressure': {'service_check_name': base_sc_name + '.disk_pressure', 'mapping': self.condition_to_status_negative},
'NetworkUnavailable': {'service_check_name': base_sc_name + '.network_unavailable', 'mapping': self.condition_to_status_negative},
'MemoryPressure': {'service_check_name': base_sc_name + '.memory_pressure', 'mapping': self.condition_to_status_negative}
}
label_value = self._extract_label_value('status', labels)
return label_value, switch.get(self._extract_label_value('condition', labels), {'service_check_name': None, 'mapping': None})

def _extract_label_value(self, name, labels):
"""
Search for `name` in labels name and returns
@@ -316,8 +359,15 @@ def kube_job_status_succeeded(self, message, **kwargs):
tags.append(self._format_tag(label.name, label.value))
self.job_succeeded_count[frozenset(tags)] += metric.gauge.value

def kube_node_status_condition(self, message, **kwargs):
""" The ready status of a cluster node. >v1.0.0"""
base_check_name = self.NAMESPACE + '.node'
for metric in message.metric:
self._condition_to_tag_check(metric, base_check_name, self.condition_to_status_positive,
tags=[self._label_to_tag("node", metric.label)])

def kube_node_status_ready(self, message, **kwargs):
""" The ready status of a cluster node. """
""" The ready status of a cluster node."""
service_check_name = self.NAMESPACE + '.node.ready'
for metric in message.metric:
self._condition_to_service_check(metric, service_check_name, self.condition_to_status_positive,
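For reference, the following is a minimal standalone sketch of how the new condition handling resolves a scraped kube_node_status_condition sample into a service check name and status, mirroring _get_metric_condition_map above. The OK/CRITICAL/UNKNOWN constants and the condition_to_status_positive/negative maps are simplified assumptions here; in the agent they are attributes of the check class and are not part of this diff.

# Minimal sketch, not the shipped check: resolve one kube_node_status_condition
# sample into a service check name and status.
OK, CRITICAL, UNKNOWN = 0, 2, 3  # stand-ins for the AgentCheck status constants

condition_to_status_positive = {'true': OK, 'false': CRITICAL, 'unknown': UNKNOWN}
condition_to_status_negative = {'true': CRITICAL, 'false': OK, 'unknown': UNKNOWN}

SWITCH = {
    'Ready': ('.ready', condition_to_status_positive),
    'OutOfDisk': ('.out_of_disk', condition_to_status_negative),
    'DiskPressure': ('.disk_pressure', condition_to_status_negative),
    'NetworkUnavailable': ('.network_unavailable', condition_to_status_negative),
    'MemoryPressure': ('.memory_pressure', condition_to_status_negative),
}

def resolve(base_sc_name, labels):
    """Return (service_check_name, status) for a condition sample, or None if unhandled."""
    entry = SWITCH.get(labels.get('condition'))
    if entry is None:
        return None
    suffix, mapping = entry
    return base_sc_name + suffix, mapping[labels.get('status')]

# kube_node_status_condition{condition="Ready",node="minikube",status="true"} 1
print(resolve('kubernetes_state.node', {'condition': 'Ready', 'status': 'true'}))
# -> ('kubernetes_state.node.ready', 0)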
18 changes: 18 additions & 0 deletions kubernetes_state/ci/fixtures/prometheus/prometheus.txt
@@ -1,3 +1,18 @@
# HELP kube_hpa_metadata_generation The generation observed by the HorizontalPodAutoscaler controller.
# TYPE kube_hpa_metadata_generation gauge
kube_hpa_metadata_generation{hpa="hpa1",namespace="ns1"} 2
# HELP kube_hpa_spec_max_replicas Upper limit for the number of pods that can be set by the autoscaler; cannot be smaller than MinReplicas.
# TYPE kube_hpa_spec_max_replicas gauge
kube_hpa_spec_max_replicas{hpa="hpa1",namespace="ns1"} 4
# HELP kube_hpa_spec_min_replicas Lower limit for the number of pods that can be set by the autoscaler, default 1.
# TYPE kube_hpa_spec_min_replicas gauge
kube_hpa_spec_min_replicas{hpa="hpa1",namespace="ns1"} 2
# HELP kube_hpa_status_current_replicas Current number of replicas of pods managed by this autoscaler.
# TYPE kube_hpa_status_current_replicas gauge
kube_hpa_status_current_replicas{hpa="hpa1",namespace="ns1"} 2
# HELP kube_hpa_status_desired_replicas Desired number of replicas of pods managed by this autoscaler.
# TYPE kube_hpa_status_desired_replicas gauge
kube_hpa_status_desired_replicas{hpa="hpa1",namespace="ns1"} 2
# HELP kube_cronjob_created Unix creation timestamp
# TYPE kube_cronjob_created gauge
kube_cronjob_created{cronjob="hello",namespace="default"} 1.509978394e+09
@@ -228,6 +243,9 @@ kube_node_status_condition{condition="OutOfDisk",node="minikube",status="unknown
kube_node_status_condition{condition="Ready",node="minikube",status="false"} 0
kube_node_status_condition{condition="Ready",node="minikube",status="true"} 1
kube_node_status_condition{condition="Ready",node="minikube",status="unknown"} 0
# HELP kube_node_status_network_unavailable Whether the network is correctly configured for the node.
# TYPE kube_node_status_network_unavailable gauge
kube_node_status_network_unavailable{node="127.0.0.1",condition="false"} 1
# HELP kube_persistentvolumeclaim_info Information about persistent volume claim.
# TYPE kube_persistentvolumeclaim_info gauge
kube_persistentvolumeclaim_info{namespace="default",persistentvolumeclaim="task-pv-claim",storageclass="manual"} 1
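The new fixture entries above can be sanity-checked locally with a short script. This sketch assumes the prometheus_client package is installed and is run from the repository root; the agent check itself parses the payload with its own Prometheus machinery, not this library.

# Quick local inspection of the new HPA samples in the fixture.
from prometheus_client.parser import text_string_to_metric_families

with open('kubernetes_state/ci/fixtures/prometheus/prometheus.txt') as f:
    payload = f.read()

for family in text_string_to_metric_families(payload):
    if family.name.startswith('kube_hpa_'):
        for sample in family.samples:
            name, labels, value = sample[:3]
            print(name, labels, value)
# Expected output includes, for example:
# kube_hpa_spec_max_replicas {'hpa': 'hpa1', 'namespace': 'ns1'} 4.0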
4 changes: 2 additions & 2 deletions kubernetes_state/manifest.json
@@ -2,7 +2,7 @@
"maintainer": "[email protected]",
"manifest_version": "0.1.0",
"max_agent_version": "6.0.0",
"min_agent_version": "5.15.0",
"min_agent_version": "5.20.0",
"name": "kubernetes_state",
"short_description": "Capture Pod scheduling events, track the status of your Kubelets, and more.",
"guid": "fa0e4395-3eae-4df8-88f2-9d7075c21a2d",
@@ -11,7 +11,7 @@
"linux",
"mac_os"
],
"version": "1.4.0",
"version": "1.5.0",
"use_omnibus_reqs": true,
"public_title": "Datadog-Kubernetes State Integration",
"categories":["orchestration", "containers"],
4 changes: 4 additions & 0 deletions kubernetes_state/metadata.csv
@@ -43,6 +43,10 @@ kubernetes_state.node.cpu_allocatable,gauge,,cpu,,The CPU resources of a node th
kubernetes_state.node.memory_allocatable,gauge,,byte,,The memory resources of a node that are available for scheduling,0,kubernetes,k8s_state.node.memory_allocatable
kubernetes_state.node.pods_allocatable,gauge,,,,The pod resources of a node that are available for scheduling,0,kubernetes,k8s_state.node.pods_allocatable
kubernetes_state.node.status,gauge,,,,Submitted with a value of 1 for each node and tagged either 'status:schedulable' or 'status:unschedulable'; Sum this metric by either status to get the number of nodes in that status.,0,kubernetes,k8s_state.node.status
kubernetes_state.hpa.min_replicas,gauge,,,,Lower limit for the number of pods that can be set by the autoscaler,0,kubernetes,k8s_state.hpa.min_replicas
kubernetes_state.hpa.max_replicas,gauge,,,,Upper limit for the number of pods that can be set by the autoscaler,0,kubernetes,k8s_state.hpa.max_replicas
kubernetes_state.hpa.target_cpu,gauge,,,,Target CPU percentage of pods managed by this autoscaler,0,kubernetes,k8s_state.hpa.target_cpu
kubernetes_state.hpa.desired_replicas,gauge,,,,Desired number of replicas of pods managed by this autoscaler,0,kubernetes,k8s_state.hpa.desired_replicas
kubernetes_state.pod.ready,gauge,,,,Whether the pod is ready to serve requests,1,kubernetes,k8s_state.pod.ready
kubernetes_state.pod.scheduled,gauge,,,,Reports the status of the scheduling process for the pod with its tags,0,kubernetes,k8s_state.pod.scheduled
kubernetes_state.replicaset.replicas,gauge,,,,The number of replicas per ReplicaSet,0,kubernetes,k8s_state.rs.replicas
55 changes: 45 additions & 10 deletions kubernetes_state/test_kubernetes_state.py
@@ -5,12 +5,13 @@
# stdlib
import mock
import os
from nose.plugins.attrib import attr

# project
from tests.checks.common import AgentCheckTest

NAMESPACE = 'kubernetes_state'

@attr(requires='kubernetes_state')
class TestKubernetesState(AgentCheckTest):

CHECK_NAME = 'kubernetes_state'
@@ -37,6 +38,11 @@ class TestKubernetesState(AgentCheckTest):
NAMESPACE + '.daemonset.scheduled',
NAMESPACE + '.daemonset.misscheduled',
NAMESPACE + '.daemonset.desired',
# hpa
NAMESPACE + '.hpa.min_replicas',
NAMESPACE + '.hpa.max_replicas',
NAMESPACE + '.hpa.desired_replicas',
NAMESPACE + '.hpa.current_replicas',
# pods
NAMESPACE + '.pod.ready',
NAMESPACE + '.pod.scheduled',
@@ -92,15 +98,17 @@ def test__update_kube_state_metrics(self, mock_poll):
}

self.run_check(config)
# Removed in https://github.com/kubernetes/kube-state-metrics/blob/master/CHANGELOG.md#v100-rc1--2017-08-02
# self.assertServiceCheck(NAMESPACE + '.node.ready', self.check.OK)
# self.assertServiceCheck(NAMESPACE + '.node.out_of_disk', self.check.OK)
# self.assertServiceCheck(NAMESPACE + '.pod.phase.running', self.check.OK)
# self.assertServiceCheck(NAMESPACE + '.pod.phase.pending', self.check.WARNING)
# TODO: uncomment when any of these are in the test protobuf.bin
# self.assertServiceCheck(NAMESPACE + '.pod.phase.succeeded', self.check.OK)
# self.assertServiceCheck(NAMESPACE + '.pod.phase.failed', self.check.CRITICAL)
# self.assertServiceCheck(NAMESPACE + '.pod.phase.unknown', self.check.UNKNOWN)

self.assertServiceCheck(NAMESPACE + '.node.ready', self.check.OK)
self.assertServiceCheck(NAMESPACE + '.node.out_of_disk', self.check.OK)
self.assertServiceCheck(NAMESPACE + '.node.memory_pressure', self.check.OK)
self.assertServiceCheck(NAMESPACE + '.node.network_unavailable', self.check.OK)
self.assertServiceCheck(NAMESPACE + '.node.disk_pressure', self.check.OK)
self.assertServiceCheck(NAMESPACE + '.pod.phase.running', self.check.OK)
self.assertServiceCheck(NAMESPACE + '.pod.phase.pending', self.check.WARNING)
self.assertServiceCheck(NAMESPACE + '.pod.phase.succeeded', self.check.OK)
self.assertServiceCheck(NAMESPACE + '.pod.phase.failed', self.check.CRITICAL)
self.assertServiceCheck(NAMESPACE + '.pod.phase.unknown', self.check.UNKNOWN)

for metric in self.METRICS:
self.assertMetric(metric)
@@ -109,6 +117,33 @@ def test__update_kube_state_metrics(self, mock_poll):

self.assert_resourcequota()

@mock.patch('checks.prometheus_check.PrometheusCheck.poll')
def test__update_kube_state_metrics_v040(self, mock_poll):
f_name = os.path.join(os.path.dirname(__file__), 'ci', 'fixtures', 'prometheus', 'prometheus.txt')
with open(f_name, 'rb') as f:
mock_poll.return_value = ('text/plain', f.read())

config = {
'instances': [{
'host': 'foo',
'kube_state_url': 'http://foo',
}]
}

self.run_check(config)

self.assertServiceCheck(NAMESPACE + '.node.ready', self.check.OK)
self.assertServiceCheck(NAMESPACE + '.node.out_of_disk', self.check.OK)
self.assertServiceCheck(NAMESPACE + '.pod.phase.running', self.check.OK)
self.assertServiceCheck(NAMESPACE + '.pod.phase.pending', self.check.WARNING)


for metric in self.METRICS:
if not metric.startswith(NAMESPACE + '.hpa'):
self.assertMetric(metric)

self.assert_resourcequota()

def assert_resourcequota(self):
""" The metric name is created dynamically so we just check some exist. """
for m in self.metrics:
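With the new @attr(requires='kubernetes_state') decorator, the suite can be filtered through nose's attribute plugin. A possible invocation, assuming a dd-agent checkout where tests.checks.common is importable:

# Run only the tests tagged with requires='kubernetes_state'.
import nose

nose.run(argv=[
    'nosetests',
    '-a', 'requires=kubernetes_state',
    'kubernetes_state/test_kubernetes_state.py',
])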
