From 426d979ee9ba40fe4cb65a28067a4d4fdcf022c1 Mon Sep 17 00:00:00 2001 From: Adel Haj Hassan <41540817+adel121@users.noreply.github.com> Date: Mon, 9 Dec 2024 19:49:12 +0100 Subject: [PATCH] [CONTP-521] use a hybrid health check for wlm kubeapiserver collector (#31876) --- .../core/healthprobe/impl/healthprobe_test.go | 3 ++- .../internal/kubeapiserver/kubeapiserver.go | 2 +- pkg/status/health/global.go | 16 +++++++------ pkg/status/health/health.go | 23 ++++++++----------- pkg/status/health/health_test.go | 6 ++--- pkg/status/health/options.go | 15 ++++++++++++ 6 files changed, 40 insertions(+), 25 deletions(-) create mode 100644 pkg/status/health/options.go diff --git a/comp/core/healthprobe/impl/healthprobe_test.go b/comp/core/healthprobe/impl/healthprobe_test.go index 0dc9b5d271bad..7774e42043765 100644 --- a/comp/core/healthprobe/impl/healthprobe_test.go +++ b/comp/core/healthprobe/impl/healthprobe_test.go @@ -13,11 +13,12 @@ import ( "net/http/httptest" "testing" + "github.com/stretchr/testify/assert" + healthprobeComponent "github.com/DataDog/datadog-agent/comp/core/healthprobe/def" logmock "github.com/DataDog/datadog-agent/comp/core/log/mock" compdef "github.com/DataDog/datadog-agent/comp/def" "github.com/DataDog/datadog-agent/pkg/status/health" - "github.com/stretchr/testify/assert" ) func TestServer(t *testing.T) { diff --git a/comp/core/workloadmeta/collectors/internal/kubeapiserver/kubeapiserver.go b/comp/core/workloadmeta/collectors/internal/kubeapiserver/kubeapiserver.go index 0666db1755b75..7def82fc21d9b 100644 --- a/comp/core/workloadmeta/collectors/internal/kubeapiserver/kubeapiserver.go +++ b/comp/core/workloadmeta/collectors/internal/kubeapiserver/kubeapiserver.go @@ -224,7 +224,7 @@ func runStartupCheck(ctx context.Context, stores []*reflectorStore) { // There is no way to ensure liveness correctly as it would need to be plugged inside the // inner loop of Reflector. // However, we add Startup when we got at least some data. - startupHealthCheck := health.RegisterStartup(componentName) + startupHealthCheck := health.RegisterReadiness(componentName, health.Once) // Checked synced, in its own scope to cleanly un-reference the syncTimer { diff --git a/pkg/status/health/global.go b/pkg/status/health/global.go index 32af425cbe111..9f6c031f22128 100644 --- a/pkg/status/health/global.go +++ b/pkg/status/health/global.go @@ -12,21 +12,23 @@ import ( var readinessAndLivenessCatalog = newCatalog() var readinessOnlyCatalog = newCatalog() -var startupOnlyCatalog = newStartupCatalog() +var startupOnlyCatalog = newCatalog() // RegisterReadiness registers a component for readiness check with the default 30 seconds timeout, returns a token -func RegisterReadiness(name string) *Handle { - return readinessOnlyCatalog.register(name) +func RegisterReadiness(name string, options ...Option) *Handle { + return readinessOnlyCatalog.register(name, options...) } // RegisterLiveness registers a component for liveness check with the default 30 seconds timeout, returns a token -func RegisterLiveness(name string) *Handle { - return readinessAndLivenessCatalog.register(name) +func RegisterLiveness(name string, options ...Option) *Handle { + return readinessAndLivenessCatalog.register(name, options...) } // RegisterStartup registers a component for startup check, returns a token -func RegisterStartup(name string) *Handle { - return startupOnlyCatalog.register(name) +func RegisterStartup(name string, options ...Option) *Handle { + // Startup health checks are registered with Once option because, by design, they should stop being checked + // once they are marked as healthy once + return startupOnlyCatalog.register(name, append(options, Once)...) } // Deregister a component from the healthcheck diff --git a/pkg/status/health/health.go b/pkg/status/health/health.go index 7e3e3327bb794..0b1a0e8a69c96 100644 --- a/pkg/status/health/health.go +++ b/pkg/status/health/health.go @@ -29,33 +29,25 @@ type component struct { name string healthChan chan time.Time healthy bool + // if set to true, once the check is healthy, we mark it as healthy forever and we stop checking it + once bool } type catalog struct { sync.RWMutex components map[*Handle]*component latestRun time.Time - startup bool } func newCatalog() *catalog { return &catalog{ components: make(map[*Handle]*component), latestRun: time.Now(), // Start healthy - startup: false, - } -} - -func newStartupCatalog() *catalog { - return &catalog{ - components: make(map[*Handle]*component), - latestRun: time.Now(), // Start healthy - startup: true, } } // register a component with the default 30 seconds timeout, returns a token -func (c *catalog) register(name string) *Handle { +func (c *catalog) register(name string, options ...Option) *Handle { c.Lock() defer c.Unlock() @@ -68,6 +60,11 @@ func (c *catalog) register(name string) *Handle { healthChan: make(chan time.Time, bufferSize), healthy: false, } + + for _, option := range options { + option(component) + } + h := &Handle{ C: component.healthChan, } @@ -107,8 +104,8 @@ func (c *catalog) pingComponents(healthDeadline time.Time) bool { c.Lock() defer c.Unlock() for _, component := range c.components { - // In startup mode, we skip already healthy components. - if c.startup && component.healthy { + // We skip components that are registered to be skipped once they pass once + if component.healthy && component.once { continue } select { diff --git a/pkg/status/health/health_test.go b/pkg/status/health/health_test.go index f1d934fb157e5..37a57773a1d73 100644 --- a/pkg/status/health/health_test.go +++ b/pkg/status/health/health_test.go @@ -124,9 +124,9 @@ func TestGetHealthy(t *testing.T) { assert.Len(t, status.Unhealthy, 0) } -func TestStartupCatalog(t *testing.T) { - cat := newStartupCatalog() - token := cat.register("test1") +func TestCatalogWithOnceComponent(t *testing.T) { + cat := newCatalog() + token := cat.register("test1", Once) // Start unhealthy status := cat.getStatus() diff --git a/pkg/status/health/options.go b/pkg/status/health/options.go new file mode 100644 index 0000000000000..c2182a6a3d485 --- /dev/null +++ b/pkg/status/health/options.go @@ -0,0 +1,15 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2016-present Datadog, Inc. + +// Package health implements the internal healthcheck +package health + +// Option represents the application of an option to a component's health check +type Option func(*component) + +// Once has the effect of not checking the health of a component once it has been marked healthy once +func Once(c *component) { + c.once = true +}