From 7d8c779dd6955d40d4e122977f2ccdc4ab0eee31 Mon Sep 17 00:00:00 2001 From: Stefan Bueringer Date: Fri, 21 Jul 2023 07:14:21 +0200 Subject: [PATCH] ClusterCacheTracker: fix accessor deletion on health check failure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Stefan Büringer buringerst@vmware.com --- controllers/remote/cluster_cache_tracker.go | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/controllers/remote/cluster_cache_tracker.go b/controllers/remote/cluster_cache_tracker.go index 9a1c42be0914..9bd40ceab13f 100644 --- a/controllers/remote/cluster_cache_tracker.go +++ b/controllers/remote/cluster_cache_tracker.go @@ -654,14 +654,18 @@ func (t *ClusterCacheTracker) healthCheckCluster(ctx context.Context, in *health } err := wait.PollUntilContextCancel(ctx, in.interval, true, runHealthCheckWithThreshold) - // An error returned implies the health check has failed a sufficient number of - // times for the cluster to be considered unhealthy - // NB. we are ignoring ErrWaitTimeout because this error happens when the channel is close, that in this case - // happens when the cache is explicitly stopped. - if err != nil && !wait.Interrupted(err) { + // An error returned implies the health check has failed a sufficient number of times for the cluster + // to be considered unhealthy or the cache was stopped and thus the cache context canceled (we pass the + // cache context into wait.PollUntilContextCancel). + // NB. Log all errors that occurred even if this error might just be from a cancel of the cache context + // when the cache is stopped. Logging an error in this case is not a problem and makes debugging easier. + if err != nil { t.log.Error(err, "Error health checking cluster", "Cluster", klog.KRef(in.cluster.Namespace, in.cluster.Name)) - t.deleteAccessor(ctx, in.cluster) } + // Ensure in any case that the accessor is deleted (even if it is a no-op). + // NB. It is crucial to ensure the accessor was deleted, so it can be later recreated when the + // cluster is reachable again + t.deleteAccessor(ctx, in.cluster) } // newClientWithTimeout returns a new client which sets the specified timeout on all Get and List calls.