From 9628bb29371441e7b329a71bc92089afc677c52d Mon Sep 17 00:00:00 2001
From: Yusuke Kuoka
Date: Fri, 11 Mar 2022 19:03:17 +0900
Subject: [PATCH] Prevent RemoveRunner spam on busy ephemeral runner scale down (#1204)

Since #1127 and #1167, we had been retrying the `RemoveRunner` API call on
each graceful runner stop attempt when the runner was still busy. There was
no reliable way to throttle the retry attempts. The combination of these
resulted in ARC spamming RemoveRunner calls (one call per reconciliation
loop, and the loop runs quite often due to how the controller works) once a
call failed because the runner was in the middle of running a workflow job.

This fixes that by adding a few short-circuit conditions that work for
ephemeral runners. An ephemeral runner can unregister itself on completion,
so in most cases ARC can just wait for the runner to stop if it's already
running a job. Since a RemoveRunner response with status 422 implies that
the runner is running a job, we can use that as the trigger to start the
runner stop waiter.

The end result is that 422 errors are observed at most once over the whole
graceful termination process of an ephemeral runner pod, and RemoveRunner
API calls are never retried for ephemeral runners. ARC consumes less of the
GitHub API rate limit budget, and the logs are much cleaner than before.

Ref https://github.com/actions-runner-controller/actions-runner-controller/pull/1167#issuecomment-1064213271
---
 controllers/constants.go            |  4 ++
 controllers/runner_controller.go    |  3 +-
 controllers/runner_graceful_stop.go | 82 ++++++++++++++++++++++-------
 3 files changed, 68 insertions(+), 21 deletions(-)

diff --git a/controllers/constants.go b/controllers/constants.go
index 68e1db373e..5b67026bf1 100644
--- a/controllers/constants.go
+++ b/controllers/constants.go
@@ -18,6 +18,10 @@ const (
 	// AnnotationKeyUnregistrationCompleteTimestamp is the annotation that is added onto the pod once the previously started unregistration process has been completed.
 	AnnotationKeyUnregistrationCompleteTimestamp = annotationKeyPrefix + "unregistration-complete-timestamp"
 
+	// AnnotationKeyRunnerCompletionWaitStartTimestamp is the annotation that is added onto the pod when
+	// ARC decided to wait for the pod to complete by itself, without the need for ARC to unregister the corresponding runner.
+	AnnotationKeyRunnerCompletionWaitStartTimestamp = annotationKeyPrefix + "runner-completion-wait-start-timestamp"
+
 	// unregistarionStartTimestamp is the annotation that contains the time that the requested unregistration process has been started
 	AnnotationKeyUnregistrationStartTimestamp = annotationKeyPrefix + "unregistration-start-timestamp"
 
diff --git a/controllers/runner_controller.go b/controllers/runner_controller.go
index 7b300777c2..6fd816f2c1 100644
--- a/controllers/runner_controller.go
+++ b/controllers/runner_controller.go
@@ -53,6 +53,7 @@ const (
 	EnvVarOrg        = "RUNNER_ORG"
 	EnvVarRepo       = "RUNNER_REPO"
 	EnvVarEnterprise = "RUNNER_ENTERPRISE"
+	EnvVarEphemeral  = "RUNNER_EPHEMERAL"
 )
 
 // RunnerReconciler reconciles a Runner object
@@ -535,7 +536,7 @@ func newRunnerPod(runnerName string, template corev1.Pod, runnerSpec v1alpha1.Ru
 			Value: workDir,
 		},
 		{
-			Name:  "RUNNER_EPHEMERAL",
+			Name:  EnvVarEphemeral,
 			Value: fmt.Sprintf("%v", ephemeral),
 		},
 	}
diff --git a/controllers/runner_graceful_stop.go b/controllers/runner_graceful_stop.go
index d76bbae01d..2a18b33868 100644
--- a/controllers/runner_graceful_stop.go
+++ b/controllers/runner_graceful_stop.go
@@ -32,7 +32,7 @@ func tickRunnerGracefulStop(ctx context.Context, unregistrationTimeout time.Dura
 		return nil, &ctrl.Result{}, err
 	}
 
-	if res, err := ensureRunnerUnregistration(ctx, unregistrationTimeout, retryDelay, log, ghClient, enterprise, organization, repository, runner, pod); res != nil {
+	if res, err := ensureRunnerUnregistration(ctx, unregistrationTimeout, retryDelay, log, ghClient, c, enterprise, organization, repository, runner, pod); res != nil {
 		return nil, res, err
 	}
 
@@ -69,7 +69,7 @@ func annotatePodOnce(ctx context.Context, c client.Client, log logr.Logger, pod
 }
 
 // If the first return value is nil, it's safe to delete the runner pod.
-func ensureRunnerUnregistration(ctx context.Context, unregistrationTimeout time.Duration, retryDelay time.Duration, log logr.Logger, ghClient *github.Client, enterprise, organization, repository, runner string, pod *corev1.Pod) (*ctrl.Result, error) {
+func ensureRunnerUnregistration(ctx context.Context, unregistrationTimeout time.Duration, retryDelay time.Duration, log logr.Logger, ghClient *github.Client, c client.Client, enterprise, organization, repository, runner string, pod *corev1.Pod) (*ctrl.Result, error) {
 	var runnerID *int64
 
 	if id, ok := getAnnotation(pod, AnnotationKeyRunnerID); ok {
@@ -81,8 +81,25 @@ func ensureRunnerUnregistration(ctx context.Context, unregistrationTimeout time.
 		runnerID = &v
 	}
 
-	ok, err := unregisterRunner(ctx, ghClient, enterprise, organization, repository, runner, runnerID)
-	if err != nil {
+	code := runnerContainerExitCode(pod)
+
+	if pod != nil && pod.Annotations[AnnotationKeyUnregistrationCompleteTimestamp] != "" {
+		// If it's already unregistered in the previous reconciliation loop,
+		// you can safely assume that it won't get registered again so it's safe to delete the runner pod.
+		log.Info("Runner pod is marked as already unregistered.")
+	} else if pod != nil && runnerPodOrContainerIsStopped(pod) {
+		// If it's an ephemeral runner with the actions/runner container exited with 0,
+		// we can safely assume that it has unregistered itself from GitHub Actions
+		// so it's natural that RemoveRunner fails due to 404.
+
+		// If pod has ended up succeeded we need to restart it
+		// Happens e.g. when dind is in runner and run completes
+		log.Info("Runner pod has been stopped with a successful status.")
+	} else if pod != nil && pod.Annotations[AnnotationKeyRunnerCompletionWaitStartTimestamp] != "" {
+		log.Info("Runner pod is annotated to wait for completion")
+
+		return &ctrl.Result{RequeueAfter: retryDelay}, nil
+	} else if ok, err := unregisterRunner(ctx, ghClient, enterprise, organization, repository, runner, runnerID); err != nil {
 		if errors.Is(err, &gogithub.RateLimitError{}) {
 			// We log the underlying error when we failed calling GitHub API to list or unregisters,
 			// or the runner is still busy.
@@ -97,12 +114,12 @@ func ensureRunnerUnregistration(ctx context.Context, unregistrationTimeout time.
 			return &ctrl.Result{RequeueAfter: retryDelayOnGitHubAPIRateLimitError}, err
 		}
 
-		log.Error(err, "Failed to unregister runner before deleting the pod.")
+		log.V(1).Info("Failed to unregister runner before deleting the pod.", "error", err)
+
+		var runnerBusy bool
 
 		errRes := &gogithub.ErrorResponse{}
 		if errors.As(err, &errRes) {
-			code := runnerContainerExitCode(pod)
-
 			runner, _ := getRunner(ctx, ghClient, enterprise, organization, repository, runner)
 
 			var runnerID int64
@@ -111,7 +128,9 @@ func ensureRunnerUnregistration(ctx context.Context, unregistrationTimeout time.
 				runnerID = *runner.ID
 			}
 
-			if errRes.Response.StatusCode == 422 && code != nil {
+			runnerBusy = errRes.Response.StatusCode == 422
+
+			if runnerBusy && code != nil {
 				log.V(2).Info("Runner container has already stopped but the unregistration attempt failed. "+
 					"This can happen when the runner container crashed due to an unhandled error, OOM, etc. "+
 					"ARC terminates the pod anyway. You'd probably need to manually delete the runner later by calling the GitHub API",
@@ -123,6 +142,28 @@ func ensureRunnerUnregistration(ctx context.Context, unregistrationTimeout time.
 			}
 		}
 
+		if runnerBusy {
+			// We want to prevent spamming the deletion attempts but returning ctrl.Result with RequeueAfter doesn't
+			// work as the reconciliation can happen earlier due to pod status update.
+			// For ephemeral runners, we can expect it to stop and unregister itself on completion.
+			// So we can just wait for the completion without actively retrying unregistration.
+			ephemeral := getRunnerEnv(pod, EnvVarEphemeral)
+			if ephemeral == "true" {
+				pod, err = annotatePodOnce(ctx, c, log, pod, AnnotationKeyRunnerCompletionWaitStartTimestamp, time.Now().Format(time.RFC3339))
+				if err != nil {
+					return &ctrl.Result{}, err
+				}
+
+				return &ctrl.Result{}, nil
+			}
+
+			log.V(2).Info("Retrying runner unregistration because the static runner is still busy")
+			// Otherwise we may end up spamming 422 errors,
+			// each call consuming GitHub API rate limit
+			// https://github.com/actions-runner-controller/actions-runner-controller/pull/1167#issuecomment-1064213271
+			return &ctrl.Result{RequeueAfter: retryDelay}, nil
+		}
+
 		return &ctrl.Result{}, err
 	} else if ok {
 		log.Info("Runner has just been unregistered.")
@@ -135,18 +176,6 @@ func ensureRunnerUnregistration(ctx context.Context, unregistrationTimeout time.
 		// In that case we can safely assume that the runner will never be registered.
 		log.Info("Runner was not found on GitHub and the runner pod was not found on Kuberntes.")
 
-	} else if pod.Annotations[AnnotationKeyUnregistrationCompleteTimestamp] != "" {
-		// If it's already unregistered in the previous reconcilation loop,
-		// you can safely assume that it won't get registered again so it's safe to delete the runner pod.
- log.Info("Runner pod is marked as already unregistered.") - } else if runnerPodOrContainerIsStopped(pod) { - // If it's an ephemeral runner with the actions/runner container exited with 0, - // we can safely assume that it has unregistered itself from GitHub Actions - // so it's natural that RemoveRunner fails due to 404. - - // If pod has ended up succeeded we need to restart it - // Happens e.g. when dind is in runner and run completes - log.Info("Runner pod has been stopped with a successful status.") } else if ts := pod.Annotations[AnnotationKeyUnregistrationStartTimestamp]; ts != "" { t, err := time.Parse(time.RFC3339, ts) if err != nil { @@ -241,6 +270,19 @@ func podRunnerID(pod *corev1.Pod) string { return id } +func getRunnerEnv(pod *corev1.Pod, key string) string { + for _, c := range pod.Spec.Containers { + if c.Name == containerName { + for _, e := range c.Env { + if e.Name == key { + return e.Value + } + } + } + } + return "" +} + // unregisterRunner unregisters the runner from GitHub Actions by name. // // This function returns: