Skip to content

Commit

Permalink
Add support for kube_scheduler SLI metrics (#15731)
Browse files Browse the repository at this point in the history
* Create scraper for SLI metrics

* Add sli metrics test fixture

* Capture sli kubernetes_healthcheck and healthchecks_total metrics

* Add unit tests for detecting sli endpoint logic

* Remove unused urljoin helper function

* Add new metrics to metadata

* Add changelog entry

* Fix logic for checking if sli endpoint is available

* Add check to prevent querying metrics if url unavailable

* Use general instance fixture

* Use get_here helper function to get current directory

* Use format instead of f string for python 2 support

* Remove custom tag from default test instance fixture
  • Loading branch information
jennchenn authored Sep 8, 2023
1 parent d64642e commit 1d70f94
Show file tree
Hide file tree
Showing 7 changed files with 179 additions and 1 deletion.
1 change: 1 addition & 0 deletions kube_scheduler/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
***Added***:

* Add goroutine_by_scheduling_operation metric to kube_scheduler check ([#15697](https://github.com/DataDog/integrations-core/pull/15697))
* Add support for kube_scheduler SLI metrics ([#15731](https://github.com/DataDog/integrations-core/pull/15731))

## 4.6.1 / 2023-08-18

Expand Down
14 changes: 13 additions & 1 deletion kube_scheduler/datadog_checks/kube_scheduler/kube_scheduler.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@
from datadog_checks.base.config import is_affirmative
from datadog_checks.base.utils.http import RequestsWrapper

from .sli_metrics import SliMetricsScraperMixin

DEFAULT_COUNTERS = {
# Number of HTTP requests, partitioned by status code, method, and host.
'rest_client_requests_total': 'client.http.requests',
Expand Down Expand Up @@ -131,7 +133,7 @@
]


class KubeSchedulerCheck(KubeLeaderElectionMixin, OpenMetricsBaseCheck):
class KubeSchedulerCheck(KubeLeaderElectionMixin, SliMetricsScraperMixin, OpenMetricsBaseCheck):
DEFAULT_METRIC_LIMIT = 0

KUBE_SCHEDULER_NAMESPACE = "kube_scheduler"
Expand Down Expand Up @@ -183,9 +185,15 @@ def __init__(self, name, init_config, instances):

instance['health_url'] = url

inst = instances[0] if instances else None
slis_instance = self.create_sli_prometheus_instance(inst)
self.slis_scraper_config = self.get_scraper_config(slis_instance)
self.detect_sli_endpoint(self.get_http_handler(self.slis_scraper_config), slis_instance.get('prometheus_url'))

def check(self, instance):
# Get the configuration for this specific instance
scraper_config = self.get_scraper_config(instance)

# Set up metric_transformers
transformers = {}
for metric_from, metric_to in TRANSFORM_VALUE_HISTOGRAMS.items():
Expand All @@ -203,6 +211,10 @@ def check(self, instance):

self._perform_service_check(instance)

if self._slis_available:
self.log.debug('processing kube scheduler sli metrics')
self.process(self.slis_scraper_config)

def _perform_service_check(self, instance):
url = instance.get('health_url')
if url is None:
Expand Down
64 changes: 64 additions & 0 deletions kube_scheduler/datadog_checks/kube_scheduler/sli_metrics.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
# (C) Datadog, Inc. 2023-present
# All rights reserved
# Licensed under a 3-clause BSD style license (see LICENSE)
from __future__ import division

from copy import deepcopy

# Suffix appended to the instance's `prometheus_url` to reach the SLI endpoint.
SLI_METRICS_PATH = '/slis'

# Gauges scraped from the SLI endpoint, passed to the scraper as a `metrics` mapping.
SLI_GAUGES = {'kubernetes_healthcheck': 'kubernetes_healthcheck'}

# Counters scraped from the SLI endpoint, passed to the scraper as a `metrics` mapping.
SLI_COUNTERS = {'kubernetes_healthchecks_total': 'kubernetes_healthchecks_total'}


class SliMetricsScraperMixin(object):
    """
    Scrape the kube scheduler "/metrics/slis" prometheus endpoint and submit the
    metrics on behalf of a check.

    The class that mixes this in must provide ``self.log`` (used for debug output).
    """

    def __init__(self, *args, **kwargs):
        super(SliMetricsScraperMixin, self).__init__(*args, **kwargs)
        # Tri-state cache for the endpoint probe:
        #   None  -> not determined yet (never probed, or the probe errored)
        #   True  -> endpoint answered with something other than 403/404
        #   False -> endpoint answered 403 or 404
        self._slis_available = None

    def create_sli_prometheus_instance(self, instance):
        """
        Create a copy of the instance and set default values.
        This is so the base class can create a scraper_config with the proper values.

        :param instance: the check instance dict, or None when no instance is configured.
        :return: a new dict (the input is never mutated) whose ``namespace``, ``metrics``
                 and ``prometheus_url`` target the SLI endpoint. ``prometheus_url`` is None
                 when the source instance does not define one.
        """
        KUBE_SCHEDULER_SLI_NAMESPACE = "kube_scheduler.slis"

        # A missing instance is treated as an empty one instead of failing on `None.update`.
        sli_instance = deepcopy(instance) if instance is not None else {}

        base_url = sli_instance.get('prometheus_url')
        if base_url:
            # Strip trailing slashes so "http://host/metrics/" does not become ".../metrics//slis".
            sli_url = base_url.rstrip('/') + SLI_METRICS_PATH
        else:
            # No base URL to derive the SLI endpoint from; leave it unset rather than
            # raising a TypeError on `None + str`.
            sli_url = None

        sli_instance.update(
            {
                'namespace': KUBE_SCHEDULER_SLI_NAMESPACE,
                'prometheus_url': sli_url,
                'metrics': [SLI_GAUGES, SLI_COUNTERS],
            }
        )
        return sli_instance

    def detect_sli_endpoint(self, http_handler, url):
        """
        Whether the sli metrics endpoint is available (k8s 1.26+).

        The result of a completed probe is cached in ``self._slis_available`` and returned
        on later calls without issuing another request.

        :param http_handler: object exposing ``head(url)`` and returning a response with
                             a ``status_code`` attribute.
        :param url: URL of the SLI metrics endpoint; may be None or empty.
        :return: False if the endpoint answers 404 or 403, if no url is given, or if the
                 request raises (in the last two cases nothing is cached); True otherwise.
        """
        if self._slis_available is not None:
            return self._slis_available

        if not url:
            # Nothing to probe; avoid issuing a request that can only fail.
            self.log.debug("No SLIs endpoint URL available, skipping SLI metrics detection")
            return False

        try:
            r = http_handler.head(url)
        except Exception as e:
            # Best effort: any failure to reach the endpoint is reported as "not available"
            # without caching, so the state stays undetermined (None).
            self.log.debug("Error querying SLIs endpoint: %s", e)
            return False

        if r.status_code == 403:
            # Adjacent string literals are used instead of a backslash continuation so that
            # source indentation does not leak into the logged message.
            self.log.debug(
                "The /metrics/slis endpoint was introduced in Kubernetes v1.26. "
                "If you expect to see SLI metrics, "
                "please check that your permissions are configured properly."
            )

        self._slis_available = r.status_code not in (403, 404)
        return self._slis_available
5 changes: 5 additions & 0 deletions kube_scheduler/hatch.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,5 +7,10 @@ base-package-features = [
[[envs.default.matrix]]
python = ["2.7", "3.9"]

[envs.default]
dependencies = [
"requests-mock==1.4.0",
]

[envs.default.env-vars]
DDEV_SKIP_GENERIC_TAGS_CHECK = "true"
2 changes: 2 additions & 0 deletions kube_scheduler/metadata.csv
Original file line number Diff line number Diff line change
Expand Up @@ -38,5 +38,7 @@ kube_scheduler.scheduling.pod.scheduling_duration.sum,gauge,,,second,"Total e2e
kube_scheduler.scheduling.pod.scheduling_duration.count,gauge,,,,"E2e latency for a pod being scheduled which may include multiple scheduling attempts (requires k8s v1.23+)",0,kube_scheduler,scheduling.pod.scheduling_duration.count,
kube_scheduler.scheduling.attempt_duration.sum,gauge,,,second,"Total scheduling attempt latency in seconds (scheduling algorithm + binding) (requires k8s v1.23+)",0,kube_scheduler,scheduling.attempt_duration.sum,
kube_scheduler.scheduling.attempt_duration.count,gauge,,,,"Scheduling attempt latency in seconds (scheduling algorithm + binding) (requires k8s v1.23+)",0,kube_scheduler,scheduling.attempt_duration.count,
kube_scheduler.slis.kubernetes_healthcheck,gauge,,,,"Result of a single scheduler healthcheck (alpha; requires k8s v1.26+)",0,kube_scheduler,slis.kubernetes_healthcheck,
kube_scheduler.slis.kubernetes_healthchecks_total,count,,,,"Cumulative results of all scheduler healthchecks (alpha; requires k8s v1.26+)",0,kube_scheduler,slis.kubernetes_healthchecks_total,
kube_scheduler.pending_pods,gauge,,,,"Number of pending pods, by the queue type (requires k8s v1.15+)",0,kube_scheduler,pending_pods,
kube_scheduler.queue.incoming_pods,count,,,,"Number of pods added to scheduling queues by event and queue type (requires k8s v1.17+)",0,kube_scheduler,queue.incoming_pods,
6 changes: 6 additions & 0 deletions kube_scheduler/tests/fixtures/metrics_slis_1.27.3.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# HELP kubernetes_healthcheck [ALPHA] This metric records the result of a single healthcheck.
# TYPE kubernetes_healthcheck gauge
kubernetes_healthcheck{name="ping",type="healthz"} 1
# HELP kubernetes_healthchecks_total [ALPHA] This metric records the results of all healthcheck.
# TYPE kubernetes_healthchecks_total counter
kubernetes_healthchecks_total{name="ping",status="success",type="healthz"} 2450
88 changes: 88 additions & 0 deletions kube_scheduler/tests/test_sli_metrics.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
# (C) Datadog, Inc. 2023-present
# All rights reserved
# Licensed under a 3-clause BSD style license (see LICENSE)

import os

import mock
import pytest
import requests
import requests_mock

from datadog_checks.kube_scheduler import KubeSchedulerCheck

from .common import HERE

# Constants
CHECK_NAME = 'kube_scheduler'


@pytest.fixture()
def mock_metrics():
    """Patch ``requests.get`` so it returns the contents of the SLI metrics fixture file."""
    fixture_path = os.path.join(HERE, 'fixtures', 'metrics_slis_1.27.3.txt')
    with open(fixture_path, 'r') as fixture_file:
        payload = fixture_file.read()

    # Stand-in response: status 200, plain-text content type, lines served from the fixture.
    fake_response = mock.MagicMock(
        status_code=200,
        iter_lines=lambda **kwargs: payload.split("\n"),
        headers={'Content-Type': "text/plain"},
    )
    with mock.patch('requests.get', return_value=fake_response):
        yield


def test_check_metrics_slis(aggregator, mock_metrics, mock_request, instance):
    """Run the check with the SLI endpoint reachable and verify both SLI metrics are submitted."""
    mock_request.head('http://localhost:10251/metrics/slis', status_code=200)
    check = KubeSchedulerCheck(CHECK_NAME, {}, [instance])
    check.check(instance)

    # (metric suffix, expected value, expected tags) for every metric in the fixture.
    expected_metrics = (
        ('slis.kubernetes_healthcheck', 1, ['name:ping', 'type:healthz']),
        ('slis.kubernetes_healthchecks_total', 2450, ['name:ping', 'status:success', 'type:healthz']),
    )
    for suffix, expected_value, expected_tags in expected_metrics:
        aggregator.assert_metric("{}.{}".format(CHECK_NAME, suffix), value=expected_value, tags=expected_tags)

    aggregator.assert_all_metrics_covered()


@pytest.fixture()
def mock_request():
    """Yield an active ``requests_mock.Mocker`` for the duration of a test."""
    with requests_mock.Mocker() as request_mocker:
        yield request_mocker


def test_detect_sli_endpoint(mock_metrics, mock_request, instance):
    """A 200 answer to the HEAD probe marks the SLI endpoint as available."""
    mock_request.head('http://localhost:10251/metrics/slis', status_code=200)
    check = KubeSchedulerCheck(CHECK_NAME, {}, [instance])
    check.check(instance)

    assert check._slis_available is True
    # Exactly one request went through the mocker.
    assert mock_request.call_count == 1


def test_detect_sli_endpoint_404(mock_metrics, mock_request, instance):
    """A 404 answer to the HEAD probe marks the SLI endpoint as unavailable."""
    mock_request.head('http://localhost:10251/metrics/slis', status_code=404)
    check = KubeSchedulerCheck(CHECK_NAME, {}, [instance])
    check.check(instance)

    assert check._slis_available is False
    # Exactly one request went through the mocker.
    assert mock_request.call_count == 1


def test_detect_sli_endpoint_403(mock_metrics, mock_request, instance):
    """A 403 answer to the HEAD probe marks the SLI endpoint as unavailable."""
    mock_request.head('http://localhost:10251/metrics/slis', status_code=403)
    check = KubeSchedulerCheck(CHECK_NAME, {}, [instance])
    check.check(instance)

    assert check._slis_available is False
    # Exactly one request went through the mocker.
    assert mock_request.call_count == 1


def test_detect_sli_endpoint_timeout(mock_metrics, mock_request, instance):
    """A connection timeout during the HEAD probe leaves availability undetermined (None)."""
    mock_request.head('http://localhost:10251/metrics/slis', exc=requests.exceptions.ConnectTimeout)
    check = KubeSchedulerCheck(CHECK_NAME, {}, [instance])
    check.check(instance)

    assert check._slis_available is None
    # Exactly one request went through the mocker.
    assert mock_request.call_count == 1

0 comments on commit 1d70f94

Please sign in to comment.