From f2c177a1660f56251b7b8dabf1dd56b7c9b2b381 Mon Sep 17 00:00:00 2001 From: Xavier Vello Date: Mon, 19 Mar 2018 14:49:30 +0100 Subject: [PATCH] [kube_state] add kubernetes_state.nodes.by_condition count --- kubernetes_state/CHANGELOG.md | 8 ++++++- .../kubernetes_state/__init__.py | 2 +- .../kubernetes_state/kubernetes_state.py | 22 ++++++++++++++----- kubernetes_state/manifest.json | 2 +- kubernetes_state/metadata.csv | 1 + .../test/test_kubernetes_state.py | 13 ++++++++++- 6 files changed, 38 insertions(+), 10 deletions(-) diff --git a/kubernetes_state/CHANGELOG.md b/kubernetes_state/CHANGELOG.md index 7e94ac8fd6b72..1fa3db15a2523 100644 --- a/kubernetes_state/CHANGELOG.md +++ b/kubernetes_state/CHANGELOG.md @@ -1,5 +1,10 @@ # CHANGELOG - kubernetes_state +2.4.0 / Unreleased +================== + +* [IMPROVEMENT] Add kubernetes_state.nodes.by_condition count metric [#1277][] + 2.3.0 / 2018-02-28 ================== @@ -74,4 +79,5 @@ [#965]: https://github.com/DataDog/integrations-core/issues/965 [#1000]: https://github.com/DataDog/integrations-core/issues/1000 [#1040]: https://github.com/DataDog/integrations-core/issues/1040 -[#1137]: https://github.com/DataDog/integrations-core/issues/1137 \ No newline at end of file +[#1137]: https://github.com/DataDog/integrations-core/issues/1137 +[#1277]: https://github.com/DataDog/integrations-core/issues/1277 diff --git a/kubernetes_state/datadog_checks/kubernetes_state/__init__.py b/kubernetes_state/datadog_checks/kubernetes_state/__init__.py index 37919916d2b62..9b9cc63e90819 100644 --- a/kubernetes_state/datadog_checks/kubernetes_state/__init__.py +++ b/kubernetes_state/datadog_checks/kubernetes_state/__init__.py @@ -2,6 +2,6 @@ KubernetesState = kubernetes_state.KubernetesState -__version__ = "2.3.0" +__version__ = "2.4.0" __all__ = ['kubernetes_state'] diff --git a/kubernetes_state/datadog_checks/kubernetes_state/kubernetes_state.py b/kubernetes_state/datadog_checks/kubernetes_state/kubernetes_state.py index 54c06bb3ddbb6..ed99492c45108 100644 --- 
a/kubernetes_state/datadog_checks/kubernetes_state/kubernetes_state.py +++ b/kubernetes_state/datadog_checks/kubernetes_state/kubernetes_state.py @@ -422,42 +422,52 @@ def kube_job_status_succeeded(self, message, **kwargs): self.job_succeeded_count[frozenset(tags)] += metric.gauge.value def kube_node_status_condition(self, message, **kwargs): - """ The ready status of a cluster node. >v1.0.0""" + """ The ready status of a cluster node. v1.0+""" base_check_name = self.NAMESPACE + '.node' + metric_name = self.NAMESPACE + '.nodes.by_condition' + for metric in message.metric: self._condition_to_tag_check(metric, base_check_name, self.condition_to_status_positive, tags=[self._label_to_tag("node", metric.label)]) + # Counts aggregated cluster-wide to avoid no-data issues on node churn, + # node granularity available in the service checks + tags = [ + self._label_to_tag("condition", metric.label), + self._label_to_tag("status", metric.label) + ] + self.count(metric_name, metric.gauge.value, tags) + def kube_node_status_ready(self, message, **kwargs): - """ The ready status of a cluster node.""" + """ The ready status of a cluster node (legacy)""" service_check_name = self.NAMESPACE + '.node.ready' for metric in message.metric: self._condition_to_service_check(metric, service_check_name, self.condition_to_status_positive, tags=[self._label_to_tag("node", metric.label)]) def kube_node_status_out_of_disk(self, message, **kwargs): - """ Whether the node is out of disk space. """ + """ Whether the node is out of disk space (legacy)""" service_check_name = self.NAMESPACE + '.node.out_of_disk' for metric in message.metric: self._condition_to_service_check(metric, service_check_name, self.condition_to_status_negative, tags=[self._label_to_tag("node", metric.label)]) def kube_node_status_memory_pressure(self, message, **kwargs): - """ Whether the node is in a memory pressure state. 
""" + """ Whether the node is in a memory pressure state (legacy)""" service_check_name = self.NAMESPACE + '.node.memory_pressure' for metric in message.metric: self._condition_to_service_check(metric, service_check_name, self.condition_to_status_negative, tags=[self._label_to_tag("node", metric.label)]) def kube_node_status_disk_pressure(self, message, **kwargs): - """ Whether the node is in a disk pressure state. """ + """ Whether the node is in a disk pressure state (legacy)""" service_check_name = self.NAMESPACE + '.node.disk_pressure' for metric in message.metric: self._condition_to_service_check(metric, service_check_name, self.condition_to_status_negative, tags=[self._label_to_tag("node", metric.label)]) def kube_node_status_network_unavailable(self, message, **kwargs): - """ Whether the node is in a network unavailable state. """ + """ Whether the node is in a network unavailable state (legacy)""" service_check_name = self.NAMESPACE + '.node.network_unavailable' for metric in message.metric: self._condition_to_service_check(metric, service_check_name, self.condition_to_status_negative, diff --git a/kubernetes_state/manifest.json b/kubernetes_state/manifest.json index 7e79f12316ed4..9ca80a717a0f9 100644 --- a/kubernetes_state/manifest.json +++ b/kubernetes_state/manifest.json @@ -11,7 +11,7 @@ "linux", "mac_os" ], - "version": "2.3.0", + "version": "2.4.0", "use_omnibus_reqs": true, "public_title": "Datadog-Kubernetes State Integration", "categories":["orchestration", "containers"], diff --git a/kubernetes_state/metadata.csv b/kubernetes_state/metadata.csv index 42f3c3b113368..8404d389b95fc 100644 --- a/kubernetes_state/metadata.csv +++ b/kubernetes_state/metadata.csv @@ -45,6 +45,7 @@ kubernetes_state.node.cpu_allocatable,gauge,,cpu,,The CPU resources of a node th kubernetes_state.node.memory_allocatable,gauge,,byte,,The memory resources of a node that are available for scheduling,0,kubernetes,k8s_state.node.memory_allocatable 
kubernetes_state.node.pods_allocatable,gauge,,,,The pod resources of a node that are available for scheduling,0,kubernetes,k8s_state.node.pods_allocatable kubernetes_state.node.status,gauge,,,,Submitted with a value of 1 for each node and tagged either 'status:schedulable' or 'status:unschedulable'; Sum this metric by either status to get the number of nodes in that status.,0,kubernetes,k8s_state.node.status +kubernetes_state.nodes.by_condition,count,,,,Sum by `condition` and `status` to get the number of nodes in a given condition.,0,kubernetes,k8s_state.nodes.by_cond kubernetes_state.hpa.min_replicas,gauge,,,,Lower limit for the number of pods that can be set by the autoscaler,0,kubernetes,k8s_state.hpa.min_replicas kubernetes_state.hpa.max_replicas,gauge,,,,Upper limit for the number of pods that can be set by the autoscaler,0,kubernetes,k8s_state.hpa.max_replicas kubernetes_state.hpa.target_cpu,gauge,,,,Target CPU percentage of pods managed by this autoscaler,0,kubernetes,k8s_state.hpa.target_cpu diff --git a/kubernetes_state/test/test_kubernetes_state.py b/kubernetes_state/test/test_kubernetes_state.py index 5bb8a5b49ec67..7f7dbc7b3f072 100644 --- a/kubernetes_state/test/test_kubernetes_state.py +++ b/kubernetes_state/test/test_kubernetes_state.py @@ -45,6 +45,7 @@ class TestKubernetesState(AgentCheckTest): NAMESPACE + '.node.pods_allocatable', NAMESPACE + '.node.gpu.cards_capacity', NAMESPACE + '.node.gpu.cards_allocatable', + NAMESPACE + '.nodes.by_condition', # deployments NAMESPACE + '.deployment.replicas', NAMESPACE + '.deployment.replicas_available', @@ -95,7 +96,12 @@ class TestKubernetesState(AgentCheckTest): TAGS = { NAMESPACE + '.pod.ready': ['node:minikube'], - NAMESPACE + '.pod.scheduled': ['node:minikube'] + NAMESPACE + '.pod.scheduled': ['node:minikube'], + NAMESPACE + '.nodes.by_condition': [ 'condition:MemoryPressure', 'condition:DiskPressure', 'condition:OutOfDisk', 'condition:Ready', 'status:true', 'status:false', 'status:unknown', ] } 
JOINED_METRICS = { @@ -160,6 +166,11 @@ def test__update_kube_state_metrics(self, mock_poll): self.assertServiceCheck(NAMESPACE + '.pod.phase', self.check.UNKNOWN, tags=['namespace:default', 'pod:hello-1509998460-tzh8k']) # Unknown + # Make sure we send counts for all statuses to avoid no-data graphing issues + self.assertMetric(NAMESPACE + '.nodes.by_condition', tags=['condition:Ready', 'status:true'], value=1) + self.assertMetric(NAMESPACE + '.nodes.by_condition', tags=['condition:Ready', 'status:false'], value=0) + self.assertMetric(NAMESPACE + '.nodes.by_condition', tags=['condition:Ready', 'status:unknown'], value=0) + for metric in self.METRICS: self.assertMetric( metric,