From 9e19a84dbe570b83191395a92e6ebeeb4f234ee7 Mon Sep 17 00:00:00 2001 From: Hao Liu Date: Tue, 9 May 2023 17:06:29 -0400 Subject: [PATCH] Respect ctx cancel I think there's a race condition between the controller and receptor: the controller sees the job finish and tries to cancel the work unit, but receptor does not respect the cancel and continues issuing GET requests to the kube apiserver with a dead ctx, which causes a very misleading error message ``` client rate limiter Wait returned an error: context canceled ``` --- pkg/workceptor/kubernetes.go | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/pkg/workceptor/kubernetes.go b/pkg/workceptor/kubernetes.go index 78ec744b0..1ed304191 100644 --- a/pkg/workceptor/kubernetes.go +++ b/pkg/workceptor/kubernetes.go @@ -321,6 +321,15 @@ func (kw *kubeUnit) runWorkUsingLogger() { // resuming from a previously created pod var err error for retries := 5; retries > 0; retries-- { + // check if the kw.ctx is already cancel + select { + case <-kw.ctx.Done(): + errMsg := fmt.Sprintf("Context Done while getting pod %s/%s. Error: %s", podNamespace, podName, kw.ctx.Err()) + kw.Warning(errMsg) + return + default: + } + kw.pod, err = kw.clientset.CoreV1().Pods(podNamespace).Get(kw.ctx, podName, metav1.GetOptions{}) if err == nil { break