Merge pull request #1277 from DataDog/xvello/ksm-node-bycondition
[kube_state] add kubernetes_state.nodes.by_condition count
xvello authored Mar 19, 2018
2 parents 45a2145 + f2c177a commit 424f23f
Showing 6 changed files with 37 additions and 10 deletions.
7 changes: 6 additions & 1 deletion kubernetes_state/CHANGELOG.md
@@ -1,5 +1,10 @@
 # CHANGELOG - kubernetes_state
 
+2.4.0 / Unreleased
+==================
+
+* [IMPROVEMENT] Add kubernetes_state.nodes.by_condition count metric [#1277][]
+
 2.3.0 / 2018-02-28
 ==================
@@ -74,4 +79,4 @@
 [#965]: https://github.com/DataDog/integrations-core/issues/965
 [#1000]: https://github.com/DataDog/integrations-core/issues/1000
 [#1040]: https://github.com/DataDog/integrations-core/issues/1040
-[#1137]: https://github.com/DataDog/integrations-core/issues/1137
+[#1137]: https://github.com/DataDog/integrations-core/issues/1137
@@ -2,6 +2,6 @@
 
 KubernetesState = kubernetes_state.KubernetesState
 
-__version__ = "2.3.0"
+__version__ = "2.4.0"
 
 __all__ = ['kubernetes_state']
@@ -422,42 +422,52 @@ def kube_job_status_succeeded(self, message, **kwargs):
         self.job_succeeded_count[frozenset(tags)] += metric.gauge.value
 
     def kube_node_status_condition(self, message, **kwargs):
-        """ The ready status of a cluster node. >v1.0.0"""
+        """ The ready status of a cluster node. v1.0+"""
         base_check_name = self.NAMESPACE + '.node'
+        metric_name = self.NAMESPACE + '.nodes.by_condition'
 
         for metric in message.metric:
             self._condition_to_tag_check(metric, base_check_name, self.condition_to_status_positive,
                                          tags=[self._label_to_tag("node", metric.label)])
 
+            # Counts aggregated cluster-wide to avoid no-data issues on node churn,
+            # node granularity available in the service checks
+            tags = [
+                self._label_to_tag("condition", metric.label),
+                self._label_to_tag("status", metric.label)
+            ]
+            self.count(metric_name, metric.gauge.value, tags)
+
     def kube_node_status_ready(self, message, **kwargs):
-        """ The ready status of a cluster node."""
+        """ The ready status of a cluster node (legacy)"""
         service_check_name = self.NAMESPACE + '.node.ready'
         for metric in message.metric:
             self._condition_to_service_check(metric, service_check_name, self.condition_to_status_positive,
                                              tags=[self._label_to_tag("node", metric.label)])
 
     def kube_node_status_out_of_disk(self, message, **kwargs):
-        """ Whether the node is out of disk space. """
+        """ Whether the node is out of disk space (legacy)"""
         service_check_name = self.NAMESPACE + '.node.out_of_disk'
         for metric in message.metric:
             self._condition_to_service_check(metric, service_check_name, self.condition_to_status_negative,
                                              tags=[self._label_to_tag("node", metric.label)])
 
     def kube_node_status_memory_pressure(self, message, **kwargs):
-        """ Whether the node is in a memory pressure state. """
+        """ Whether the node is in a memory pressure state (legacy)"""
         service_check_name = self.NAMESPACE + '.node.memory_pressure'
         for metric in message.metric:
             self._condition_to_service_check(metric, service_check_name, self.condition_to_status_negative,
                                              tags=[self._label_to_tag("node", metric.label)])
 
     def kube_node_status_disk_pressure(self, message, **kwargs):
-        """ Whether the node is in a disk pressure state. """
+        """ Whether the node is in a disk pressure state (legacy)"""
         service_check_name = self.NAMESPACE + '.node.disk_pressure'
         for metric in message.metric:
             self._condition_to_service_check(metric, service_check_name, self.condition_to_status_negative,
                                              tags=[self._label_to_tag("node", metric.label)])
 
     def kube_node_status_network_unavailable(self, message, **kwargs):
-        """ Whether the node is in a network unavailable state. """
+        """ Whether the node is in a network unavailable state (legacy)"""
        service_check_name = self.NAMESPACE + '.node.network_unavailable'
         for metric in message.metric:
             self._condition_to_service_check(metric, service_check_name, self.condition_to_status_negative,
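As an illustration of the aggregation above, here is a minimal, self-contained Python sketch — not the integration's actual code. The sample data is hypothetical, and plain print output stands in for the Agent's count API:

from collections import Counter

# Hypothetical samples: kube_node_status_condition exposes one gauge per
# (node, condition, status) triple, set to 1 only for the node's current status.
samples = [
    ("node-a", "Ready", "true", 1),
    ("node-a", "Ready", "false", 0),
    ("node-a", "Ready", "unknown", 0),
    ("node-b", "Ready", "true", 1),
    ("node-b", "Ready", "false", 0),
    ("node-b", "Ready", "unknown", 0),
]

# Drop the node tag so the count survives node churn (node granularity stays
# available in the service checks), and keep zero-valued statuses so
# dashboards show 0 instead of a no-data gap.
counts = Counter()
for _node, condition, status, value in samples:
    counts[(condition, status)] += value

for (condition, status), value in sorted(counts.items()):
    print(f"kubernetes_state.nodes.by_condition condition:{condition} status:{status} -> {value}")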
2 changes: 1 addition & 1 deletion kubernetes_state/manifest.json
@@ -11,7 +11,7 @@
     "linux",
     "mac_os"
   ],
-  "version": "2.3.0",
+  "version": "2.4.0",
   "use_omnibus_reqs": true,
   "public_title": "Datadog-Kubernetes State Integration",
   "categories":["orchestration", "containers"],
1 change: 1 addition & 0 deletions kubernetes_state/metadata.csv
@@ -45,6 +45,7 @@
 kubernetes_state.node.cpu_allocatable,gauge,,cpu,,The CPU resources of a node that are available for scheduling,0,kubernetes,k8s_state.node.cpu_allocatable
 kubernetes_state.node.memory_allocatable,gauge,,byte,,The memory resources of a node that are available for scheduling,0,kubernetes,k8s_state.node.memory_allocatable
 kubernetes_state.node.pods_allocatable,gauge,,,,The pod resources of a node that are available for scheduling,0,kubernetes,k8s_state.node.pods_allocatable
 kubernetes_state.node.status,gauge,,,,Submitted with a value of 1 for each node and tagged either 'status:schedulable' or 'status:unschedulable'; Sum this metric by either status to get the number of nodes in that status.,0,kubernetes,k8s_state.node.status
+kubernetes_state.nodes.by_condition,count,,,,Sum by `condition` and `status` to get the number of nodes in a given condition.,0,kubernetes,k8s_state.nodes.by_cond
 kubernetes_state.hpa.min_replicas,gauge,,,,Lower limit for the number of pods that can be set by the autoscaler,0,kubernetes,k8s_state.hpa.min_replicas
 kubernetes_state.hpa.max_replicas,gauge,,,,Upper limit for the number of pods that can be set by the autoscaler,0,kubernetes,k8s_state.hpa.max_replicas
 kubernetes_state.hpa.target_cpu,gauge,,,,Target CPU percentage of pods managed by this autoscaler,0,kubernetes,k8s_state.hpa.target_cpu
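As the description suggests, this count is meant to be summed in queries, e.g. `sum:kubernetes_state.nodes.by_condition{condition:ready} by {status}` (illustrative only; tag values may be lowercased by tag normalization).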
13 changes: 12 additions & 1 deletion kubernetes_state/test/test_kubernetes_state.py
@@ -45,6 +45,7 @@ class TestKubernetesState(AgentCheckTest):
         NAMESPACE + '.node.pods_allocatable',
         NAMESPACE + '.node.gpu.cards_capacity',
         NAMESPACE + '.node.gpu.cards_allocatable',
+        NAMESPACE + '.nodes.by_condition',
         # deployments
         NAMESPACE + '.deployment.replicas',
         NAMESPACE + '.deployment.replicas_available',
@@ -95,7 +96,12 @@ class TestKubernetesState(AgentCheckTest):
 
     TAGS = {
         NAMESPACE + '.pod.ready': ['node:minikube'],
-        NAMESPACE + '.pod.scheduled': ['node:minikube']
+        NAMESPACE + '.pod.scheduled': ['node:minikube'],
+        NAMESPACE + '.nodes.by_condition': [
+            'condition:MemoryPressure', 'condition:DiskPressure',
+            'condition:OutOfDisk', 'condition:Ready',
+            'status:true', 'status:false', 'status:unknown',
+        ]
     }
 
     JOINED_METRICS = {
@@ -160,6 +166,11 @@ def test__update_kube_state_metrics(self, mock_poll):
         self.assertServiceCheck(NAMESPACE + '.pod.phase', self.check.UNKNOWN,
                                 tags=['namespace:default', 'pod:hello-1509998460-tzh8k']) # Unknown
 
+        # Make sure we send counts for all statuses to avoid no-data graphing issues
+        self.assertMetric(NAMESPACE + '.nodes.by_condition', tags=['condition:Ready', 'status:true'], value=1)
+        self.assertMetric(NAMESPACE + '.nodes.by_condition', tags=['condition:Ready', 'status:false'], value=0)
+        self.assertMetric(NAMESPACE + '.nodes.by_condition', tags=['condition:Ready', 'status:unknown'], value=0)
+
         for metric in self.METRICS:
             self.assertMetric(
                 metric,
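The assertions above encode the design rationale in the check's comment: every status is submitted, even at zero. A small sketch of the downstream effect, with illustrative values not taken from the test fixture:

# Why always submitting zero counts matters for graphing: with all three
# statuses present, a not-ready ratio is computable even on an all-healthy
# cluster; if zero-valued series were omitted, the query below would have
# no data instead of returning 0%.
by_status = {"true": 3, "false": 0, "unknown": 0}  # condition:Ready counts

total = sum(by_status.values())
not_ready = (by_status["false"] + by_status["unknown"]) / total
print(f"not-ready nodes: {not_ready:.0%}")  # -> 0%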
