Commit

Prevent RemoveRunner spam on busy ephemeral runner scale down (#1204)
Since #1127 and #1167, we had been retrying the `RemoveRunner` API call on every graceful runner stop attempt while the runner was still busy.
There was no reliable way to throttle those retries, and the combination of the two changes resulted in ARC spamming RemoveRunner calls (one call per reconciliation loop, and the loop runs quite often due to how the controller works) whenever a call failed because the runner was in the middle of running a workflow job.

This fixes that by adding a few short-circuit conditions that work for ephemeral runners. An ephemeral runner unregisters itself on completion, so in most cases ARC can simply wait for the runner to stop if it's already running a job. Since a RemoveRunner response with status 422 implies that the runner is running a job, we use it as the trigger to start the runner-stop waiter.

The end result is that 422 errors are observed at most once per graceful termination of an ephemeral runner pod. RemoveRunner API calls are never retried for ephemeral runners, ARC consumes less of its GitHub API rate-limit budget, and the logs are much cleaner than before.
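For context, the busy signal that triggers the new wait behavior can be detected from the RemoveRunner error roughly as follows. This is a minimal sketch rather than the verbatim change; isRunnerBusy is a hypothetical helper name, and gogithub is the go-github package alias ARC already uses:

    // isRunnerBusy reports whether a RemoveRunner error means the runner is still
    // running a job; GitHub Actions responds to RemoveRunner with HTTP 422 in that case.
    func isRunnerBusy(err error) bool {
        var errRes *gogithub.ErrorResponse
        return errors.As(err, &errRes) && errRes.Response != nil && errRes.Response.StatusCode == 422
    }

When that condition holds for an ephemeral runner, ARC annotates the pod once and stops calling RemoveRunner altogether; the actual logic is in the runner_graceful_stop.go diff below.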

Ref #1167 (comment)
mumoshu authored Mar 11, 2022
1 parent 736a53f commit 9628bb2
Showing 3 changed files with 68 additions and 21 deletions.
4 changes: 4 additions & 0 deletions controllers/constants.go
@@ -18,6 +18,10 @@ const (
// AnnotationKeyUnregistrationCompleteTimestamp is the annotation that is added onto the pod once the previously started unregistration process has been completed.
AnnotationKeyUnregistrationCompleteTimestamp = annotationKeyPrefix + "unregistration-complete-timestamp"

// AnnotationKeyRunnerCompletionWaitStartTimestamp is the annotation that is added onto the pod when
// ARC decided to wait for the pod to complete by itself, without ARC having to unregister the corresponding runner.
AnnotationKeyRunnerCompletionWaitStartTimestamp = annotationKeyPrefix + "runner-completion-wait-start-timestamp"

// unregistrationStartTimestamp is the annotation that contains the time when the requested unregistration process was started
AnnotationKeyUnregistrationStartTimestamp = annotationKeyPrefix + "unregistration-start-timestamp"

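For orientation, the way this annotation gates later reconciliations is roughly the following, a condensed restatement of the runner_graceful_stop.go change further down rather than additional code:

    // Set at most once when a busy ephemeral runner is detected, so that every
    // subsequent reconciliation skips RemoveRunner and simply waits for the pod
    // to finish the job and unregister itself.
    if pod.Annotations[AnnotationKeyRunnerCompletionWaitStartTimestamp] != "" {
        log.Info("Runner pod is annotated to wait for completion")
        return &ctrl.Result{RequeueAfter: retryDelay}, nil
    }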
3 changes: 2 additions & 1 deletion controllers/runner_controller.go
@@ -53,6 +53,7 @@ const (
EnvVarOrg = "RUNNER_ORG"
EnvVarRepo = "RUNNER_REPO"
EnvVarEnterprise = "RUNNER_ENTERPRISE"
EnvVarEphemeral = "RUNNER_EPHEMERAL"
)

// RunnerReconciler reconciles a Runner object
@@ -535,7 +536,7 @@ func newRunnerPod(runnerName string, template corev1.Pod, runnerSpec v1alpha1.Ru
Value: workDir,
},
{
Name: "RUNNER_EPHEMERAL",
Name: EnvVarEphemeral,
Value: fmt.Sprintf("%v", ephemeral),
},
}
82 changes: 62 additions & 20 deletions controllers/runner_graceful_stop.go
@@ -32,7 +32,7 @@ func tickRunnerGracefulStop(ctx context.Context, unregistrationTimeout time.Dura
return nil, &ctrl.Result{}, err
}

if res, err := ensureRunnerUnregistration(ctx, unregistrationTimeout, retryDelay, log, ghClient, enterprise, organization, repository, runner, pod); res != nil {
if res, err := ensureRunnerUnregistration(ctx, unregistrationTimeout, retryDelay, log, ghClient, c, enterprise, organization, repository, runner, pod); res != nil {
return nil, res, err
}

@@ -69,7 +69,7 @@ func annotatePodOnce(ctx context.Context, c client.Client, log logr.Logger, pod
}

// If the first return value is nil, it's safe to delete the runner pod.
func ensureRunnerUnregistration(ctx context.Context, unregistrationTimeout time.Duration, retryDelay time.Duration, log logr.Logger, ghClient *github.Client, enterprise, organization, repository, runner string, pod *corev1.Pod) (*ctrl.Result, error) {
func ensureRunnerUnregistration(ctx context.Context, unregistrationTimeout time.Duration, retryDelay time.Duration, log logr.Logger, ghClient *github.Client, c client.Client, enterprise, organization, repository, runner string, pod *corev1.Pod) (*ctrl.Result, error) {
var runnerID *int64

if id, ok := getAnnotation(pod, AnnotationKeyRunnerID); ok {
@@ -81,8 +81,25 @@ func ensureRunnerUnregistration(ctx context.Context, unregistrationTimeout time.
runnerID = &v
}

ok, err := unregisterRunner(ctx, ghClient, enterprise, organization, repository, runner, runnerID)
if err != nil {
code := runnerContainerExitCode(pod)

if pod != nil && pod.Annotations[AnnotationKeyUnregistrationCompleteTimestamp] != "" {
// If it was already unregistered in a previous reconciliation loop,
// we can safely assume that it won't get registered again, so it's safe to delete the runner pod.
log.Info("Runner pod is marked as already unregistered.")
} else if pod != nil && runnerPodOrContainerIsStopped(pod) {
// If it's an ephemeral runner whose actions/runner container exited with 0,
// we can safely assume that it has unregistered itself from GitHub Actions,
// so it's natural that RemoveRunner fails with a 404.

// If the pod has ended up succeeded, we need to restart it.
// This happens e.g. when dind is in the runner and the run completes.
log.Info("Runner pod has been stopped with a successful status.")
} else if pod != nil && pod.Annotations[AnnotationKeyRunnerCompletionWaitStartTimestamp] != "" {
log.Info("Runner pod is annotated to wait for completion")

return &ctrl.Result{RequeueAfter: retryDelay}, nil
} else if ok, err := unregisterRunner(ctx, ghClient, enterprise, organization, repository, runner, runnerID); err != nil {
if errors.Is(err, &gogithub.RateLimitError{}) {
// We log the underlying error when we fail calling the GitHub API to list or unregister runners,
// or when the runner is still busy.
@@ -97,12 +114,12 @@ func ensureRunnerUnregistration(ctx context.Context, unregistrationTimeout time.
return &ctrl.Result{RequeueAfter: retryDelayOnGitHubAPIRateLimitError}, err
}

log.Error(err, "Failed to unregister runner before deleting the pod.")
log.V(1).Info("Failed to unregister runner before deleting the pod.", "error", err)

var runnerBusy bool

errRes := &gogithub.ErrorResponse{}
if errors.As(err, &errRes) {
code := runnerContainerExitCode(pod)

runner, _ := getRunner(ctx, ghClient, enterprise, organization, repository, runner)

var runnerID int64
@@ -111,7 +128,9 @@ func ensureRunnerUnregistration(ctx context.Context, unregistrationTimeout time.
runnerID = *runner.ID
}

if errRes.Response.StatusCode == 422 && code != nil {
runnerBusy = errRes.Response.StatusCode == 422

if runnerBusy && code != nil {
log.V(2).Info("Runner container has already stopped but the unregistration attempt failed. "+
"This can happen when the runner container crashed due to an unhandled error, OOM, etc. "+
"ARC terminates the pod anyway. You'd probably need to manually delete the runner later by calling the GitHub API",
@@ -123,6 +142,28 @@ func ensureRunnerUnregistration(ctx context.Context, unregistrationTimeout time.
}
}

if runnerBusy {
// We want to prevent spamming the deletion attempts, but returning ctrl.Result with RequeueAfter doesn't
// work because the reconciliation can happen earlier due to a pod status update.
// For ephemeral runners, we can expect the runner to stop and unregister itself on completion.
// So we can just wait for the completion without actively retrying unregistration.
ephemeral := getRunnerEnv(pod, EnvVarEphemeral)
if ephemeral == "true" {
pod, err = annotatePodOnce(ctx, c, log, pod, AnnotationKeyRunnerCompletionWaitStartTimestamp, time.Now().Format(time.RFC3339))
if err != nil {
return &ctrl.Result{}, err
}

return &ctrl.Result{}, nil
}

log.V(2).Info("Retrying runner unregistration because the static runner is still busy")
// Otherwise we may end up spamming 422 errors,
// each call consuming GitHub API rate limit
// https://github.com/actions-runner-controller/actions-runner-controller/pull/1167#issuecomment-1064213271
return &ctrl.Result{RequeueAfter: retryDelay}, nil
}

return &ctrl.Result{}, err
} else if ok {
log.Info("Runner has just been unregistered.")
@@ -135,18 +176,6 @@ func ensureRunnerUnregistration(ctx context.Context, unregistrationTimeout time.
// In that case we can safely assume that the runner will never be registered.

log.Info("Runner was not found on GitHub and the runner pod was not found on Kuberntes.")
} else if pod.Annotations[AnnotationKeyUnregistrationCompleteTimestamp] != "" {
// If it's already unregistered in the previous reconcilation loop,
// you can safely assume that it won't get registered again so it's safe to delete the runner pod.
log.Info("Runner pod is marked as already unregistered.")
} else if runnerPodOrContainerIsStopped(pod) {
// If it's an ephemeral runner with the actions/runner container exited with 0,
// we can safely assume that it has unregistered itself from GitHub Actions
// so it's natural that RemoveRunner fails due to 404.

// If pod has ended up succeeded we need to restart it
// Happens e.g. when dind is in runner and run completes
log.Info("Runner pod has been stopped with a successful status.")
} else if ts := pod.Annotations[AnnotationKeyUnregistrationStartTimestamp]; ts != "" {
t, err := time.Parse(time.RFC3339, ts)
if err != nil {
@@ -241,6 +270,19 @@ func podRunnerID(pod *corev1.Pod) string {
return id
}

func getRunnerEnv(pod *corev1.Pod, key string) string {
for _, c := range pod.Spec.Containers {
if c.Name == containerName {
for _, e := range c.Env {
if e.Name == key {
return e.Value
}
}
}
}
return ""
}

// unregisterRunner unregisters the runner from GitHub Actions by name.
//
// This function returns:
