From acb906164b6cc8b3604fa816cd3a99acffe991a2 Mon Sep 17 00:00:00 2001 From: Yusuke Kuoka Date: Thu, 24 Jun 2021 20:39:37 +0900 Subject: [PATCH] RunnerSet: Automatic-recovery from registration timeout and deregistration on pod termination (#652) Ref #629 Ref #613 Ref #612 --- acceptance/testdata/repo.runnerset.hra.yaml | 2 + acceptance/testdata/repo.runnerset.yaml | 4 + controllers/pod_runner_token_injector.go | 6 +- controllers/runner_controller.go | 18 +- controllers/runner_pod_controller.go | 431 ++++++++++++++++++++ main.go | 12 + 6 files changed, 465 insertions(+), 8 deletions(-) create mode 100644 controllers/runner_pod_controller.go diff --git a/acceptance/testdata/repo.runnerset.hra.yaml b/acceptance/testdata/repo.runnerset.hra.yaml index 6f08509e06..d7b76e901c 100644 --- a/acceptance/testdata/repo.runnerset.hra.yaml +++ b/acceptance/testdata/repo.runnerset.hra.yaml @@ -16,6 +16,8 @@ spec: # RunnerSet doesn't support scale from/to zero yet minReplicas: 1 maxReplicas: 5 + # This should be less than 600(seconds, the default) for faster testing + scaleDownDelaySecondsAfterScaleOut: 60 metrics: - type: PercentageRunnersBusy scaleUpThreshold: '0.75' diff --git a/acceptance/testdata/repo.runnerset.yaml b/acceptance/testdata/repo.runnerset.yaml index c3ea57c6e1..25c624cf0d 100644 --- a/acceptance/testdata/repo.runnerset.yaml +++ b/acceptance/testdata/repo.runnerset.yaml @@ -14,6 +14,10 @@ spec: serviceName: example-runnerset #replicas: 1 + + # From my limited testing, `ephemeral: true` is more reliable. + # Seomtimes, updating already deployed runners from `ephemeral: false` to `ephemeral: true` seems to + # result in queued jobs hanging forever. ephemeral: false repository: ${TEST_REPO} diff --git a/controllers/pod_runner_token_injector.go b/controllers/pod_runner_token_injector.go index 7b32051795..a186b432b4 100644 --- a/controllers/pod_runner_token_injector.go +++ b/controllers/pod_runner_token_injector.go @@ -17,6 +17,10 @@ import ( "sigs.k8s.io/controller-runtime/pkg/webhook/admission" ) +const ( + AnnotationKeyTokenExpirationDate = "actions-runner-controller/token-expires-at" +) + // +kubebuilder:webhook:path=/mutate-runner-set-pod,mutating=true,failurePolicy=ignore,groups="",resources=pods,verbs=create,versions=v1,name=mutate-runner-pod.webhook.actions.summerwind.dev,sideEffects=None,admissionReviewVersions=v1beta1 type PodRunnerTokenInjector struct { @@ -72,7 +76,7 @@ func (t *PodRunnerTokenInjector) Handle(ctx context.Context, req admission.Reque updated := mutatePod(&pod, *rt.Token) - updated.Annotations["actions-runner-controller/token-expires-at"] = ts + updated.Annotations[AnnotationKeyTokenExpirationDate] = ts if pod.Spec.RestartPolicy != corev1.RestartPolicyOnFailure { updated.Spec.RestartPolicy = corev1.RestartPolicyOnFailure diff --git a/controllers/runner_controller.go b/controllers/runner_controller.go index bb51b7ac6f..3983a20f95 100644 --- a/controllers/runner_controller.go +++ b/controllers/runner_controller.go @@ -51,6 +51,10 @@ const ( // This is an annotation internal to actions-runner-controller and can change in backward-incompatible ways annotationKeyRegistrationOnly = "actions-runner-controller/registration-only" + + EnvVarOrg = "RUNNER_ORG" + EnvVarRepo = "RUNNER_REPO" + EnvVarEnterprise = "RUNNER_ENTERPRISE" ) // RunnerReconciler reconciles a Runner object @@ -89,7 +93,7 @@ func (r *RunnerReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr } if runner.ObjectMeta.DeletionTimestamp.IsZero() { - finalizers, added := addFinalizer(runner.ObjectMeta.Finalizers) + finalizers, added := addFinalizer(runner.ObjectMeta.Finalizers, finalizerName) if added { newRunner := runner.DeepCopy() @@ -103,7 +107,7 @@ func (r *RunnerReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr return ctrl.Result{}, nil } } else { - finalizers, removed := removeFinalizer(runner.ObjectMeta.Finalizers) + finalizers, removed := removeFinalizer(runner.ObjectMeta.Finalizers, finalizerName) if removed { if len(runner.Status.Registration.Token) > 0 { @@ -741,15 +745,15 @@ func newRunnerPod(template corev1.Pod, runnerSpec v1alpha1.RunnerConfig, default env := []corev1.EnvVar{ { - Name: "RUNNER_ORG", + Name: EnvVarOrg, Value: runnerSpec.Organization, }, { - Name: "RUNNER_REPO", + Name: EnvVarRepo, Value: runnerSpec.Repository, }, { - Name: "RUNNER_ENTERPRISE", + Name: EnvVarEnterprise, Value: runnerSpec.Enterprise, }, { @@ -1029,7 +1033,7 @@ func (r *RunnerReconciler) SetupWithManager(mgr ctrl.Manager) error { Complete(r) } -func addFinalizer(finalizers []string) ([]string, bool) { +func addFinalizer(finalizers []string, finalizerName string) ([]string, bool) { exists := false for _, name := range finalizers { if name == finalizerName { @@ -1044,7 +1048,7 @@ func addFinalizer(finalizers []string) ([]string, bool) { return append(finalizers, finalizerName), true } -func removeFinalizer(finalizers []string) ([]string, bool) { +func removeFinalizer(finalizers []string, finalizerName string) ([]string, bool) { removed := false result := []string{} diff --git a/controllers/runner_pod_controller.go b/controllers/runner_pod_controller.go new file mode 100644 index 0000000000..eadfcdb2eb --- /dev/null +++ b/controllers/runner_pod_controller.go @@ -0,0 +1,431 @@ +/* +Copyright 2020 The actions-runner-controller authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package controllers + +import ( + "context" + "errors" + "fmt" + "time" + + gogithub "github.com/google/go-github/v33/github" + "k8s.io/apimachinery/pkg/util/wait" + + "github.com/go-logr/logr" + kerrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/client-go/tools/record" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + + corev1 "k8s.io/api/core/v1" + + "github.com/actions-runner-controller/actions-runner-controller/github" +) + +// RunnerPodReconciler reconciles a Runner object +type RunnerPodReconciler struct { + client.Client + Log logr.Logger + Recorder record.EventRecorder + Scheme *runtime.Scheme + GitHubClient *github.Client + Name string + RegistrationRecheckInterval time.Duration + RegistrationRecheckJitter time.Duration +} + +const ( + // This names requires at leaset one slash to work. + // See https://github.com/google/knative-gcp/issues/378 + runnerPodFinalizerName = "actions.summerwind.dev/runner-pod" + + AnnotationKeyLastRegistrationCheckTime = "actions-runner-controller/last-registration-check-time" +) + +// +kubebuilder:rbac:groups=core,resources=pods,verbs=get;list;watch;update;patch;delete +// +kubebuilder:rbac:groups=core,resources=events,verbs=create;patch + +func (r *RunnerPodReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { + log := r.Log.WithValues("runnerpod", req.NamespacedName) + + var runnerPod corev1.Pod + if err := r.Get(ctx, req.NamespacedName, &runnerPod); err != nil { + return ctrl.Result{}, client.IgnoreNotFound(err) + } + + _, isRunnerPod := runnerPod.Labels[LabelKeyRunnerSetName] + if !isRunnerPod { + return ctrl.Result{}, nil + } + + var enterprise, org, repo string + + envvars := runnerPod.Spec.Containers[0].Env + for _, e := range envvars { + switch e.Name { + case EnvVarEnterprise: + enterprise = e.Value + case EnvVarOrg: + org = e.Value + case EnvVarRepo: + repo = e.Value + } + } + + if runnerPod.ObjectMeta.DeletionTimestamp.IsZero() { + finalizers, added := addFinalizer(runnerPod.ObjectMeta.Finalizers, runnerPodFinalizerName) + + if added { + newRunner := runnerPod.DeepCopy() + newRunner.ObjectMeta.Finalizers = finalizers + + if err := r.Patch(ctx, newRunner, client.MergeFrom(&runnerPod)); err != nil { + log.Error(err, "Failed to update runner") + return ctrl.Result{}, err + } + + return ctrl.Result{}, nil + } + } else { + finalizers, removed := removeFinalizer(runnerPod.ObjectMeta.Finalizers, runnerPodFinalizerName) + + if removed { + ok, err := r.unregisterRunner(ctx, enterprise, org, repo, runnerPod.Name) + if err != nil { + if errors.Is(err, &gogithub.RateLimitError{}) { + // We log the underlying error when we failed calling GitHub API to list or unregisters, + // or the runner is still busy. + log.Error( + err, + fmt.Sprintf( + "Failed to unregister runner due to GitHub API rate limits. Delaying retry for %s to avoid excessive GitHub API calls", + retryDelayOnGitHubAPIRateLimitError, + ), + ) + + return ctrl.Result{RequeueAfter: retryDelayOnGitHubAPIRateLimitError}, err + } + + return ctrl.Result{}, err + } + + if !ok { + log.V(1).Info("Runner no longer exists on GitHub") + } + + newRunner := runnerPod.DeepCopy() + newRunner.ObjectMeta.Finalizers = finalizers + + if err := r.Patch(ctx, newRunner, client.MergeFrom(&runnerPod)); err != nil { + log.Error(err, "Failed to update runner for finalizer removal") + return ctrl.Result{}, err + } + + log.Info("Removed runner from GitHub", "repository", repo, "organization", org) + } + + deletionTimeout := 1 * time.Minute + currentTime := time.Now() + deletionDidTimeout := currentTime.Sub(runnerPod.DeletionTimestamp.Add(deletionTimeout)) > 0 + + if deletionDidTimeout { + log.Info( + fmt.Sprintf("Failed to delete pod within %s. ", deletionTimeout)+ + "This is typically the case when a Kubernetes node became unreachable "+ + "and the kube controller started evicting nodes. Forcefully deleting the pod to not get stuck.", + "podDeletionTimestamp", runnerPod.DeletionTimestamp, + "currentTime", currentTime, + "configuredDeletionTimeout", deletionTimeout, + ) + + var force int64 = 0 + // forcefully delete runner as we would otherwise get stuck if the node stays unreachable + if err := r.Delete(ctx, &runnerPod, &client.DeleteOptions{GracePeriodSeconds: &force}); err != nil { + // probably + if !kerrors.IsNotFound(err) { + log.Error(err, "Failed to forcefully delete pod resource ...") + return ctrl.Result{}, err + } + // forceful deletion finally succeeded + return ctrl.Result{Requeue: true}, nil + } + + r.Recorder.Event(&runnerPod, corev1.EventTypeNormal, "PodDeleted", fmt.Sprintf("Forcefully deleted pod '%s'", runnerPod.Name)) + log.Info("Forcefully deleted runner pod", "repository", repo) + // give kube manager a little time to forcefully delete the stuck pod + return ctrl.Result{RequeueAfter: 3 * time.Second}, nil + } + + return ctrl.Result{}, nil + } + + // If pod has ended up succeeded we need to restart it + // Happens e.g. when dind is in runner and run completes + stopped := runnerPod.Status.Phase == corev1.PodSucceeded + + if !stopped { + if runnerPod.Status.Phase == corev1.PodRunning { + for _, status := range runnerPod.Status.ContainerStatuses { + if status.Name != containerName { + continue + } + + if status.State.Terminated != nil && status.State.Terminated.ExitCode == 0 { + stopped = true + } + } + } + } + + restart := stopped + + var registrationRecheckDelay time.Duration + + // all checks done below only decide whether a restart is needed + // if a restart was already decided before, there is no need for the checks + // saving API calls and scary log messages + if !restart { + registrationCheckInterval := time.Minute + if r.RegistrationRecheckInterval > 0 { + registrationCheckInterval = r.RegistrationRecheckInterval + } + + lastCheckTimeStr := runnerPod.Annotations[AnnotationKeyLastRegistrationCheckTime] + + var lastCheckTime *time.Time + + if lastCheckTimeStr != "" { + t, err := time.Parse(time.RFC3339, lastCheckTimeStr) + if err != nil { + log.Error(err, "failed to parase last check time %q", lastCheckTimeStr) + return ctrl.Result{}, nil + } + + lastCheckTime = &t + } + + // We want to call ListRunners GitHub Actions API only once per runner per minute. + // This if block, in conjunction with: + // return ctrl.Result{RequeueAfter: registrationRecheckDelay}, nil + // achieves that. + if lastCheckTime != nil { + nextCheckTime := lastCheckTime.Add(registrationCheckInterval) + now := time.Now() + + // Requeue scheduled by RequeueAfter can happen a bit earlier (like dozens of milliseconds) + // so to avoid excessive, in-effective retry, we heuristically ignore the remaining delay in case it is + // shorter than 1s + requeueAfter := nextCheckTime.Sub(now) - time.Second + if requeueAfter > 0 { + log.Info( + fmt.Sprintf("Skipped registration check because it's deferred until %s. Retrying in %s at latest", nextCheckTime, requeueAfter), + "lastRegistrationCheckTime", lastCheckTime, + "registrationCheckInterval", registrationCheckInterval, + ) + + // Without RequeueAfter, the controller may not retry on scheduled. Instead, it must wait until the + // next sync period passes, which can be too much later than nextCheckTime. + // + // We need to requeue on this reconcilation even though we have already scheduled the initial + // requeue previously with `return ctrl.Result{RequeueAfter: registrationRecheckDelay}, nil`. + // Apparently, the workqueue used by controller-runtime seems to deduplicate and resets the delay on + // other requeues- so the initial scheduled requeue may have been reset due to requeue on + // spec/status change. + return ctrl.Result{RequeueAfter: requeueAfter}, nil + } + } + + notFound := false + offline := false + + _, err := r.GitHubClient.IsRunnerBusy(ctx, enterprise, org, repo, runnerPod.Name) + + currentTime := time.Now() + + if err != nil { + var notFoundException *github.RunnerNotFound + var offlineException *github.RunnerOffline + if errors.As(err, ¬FoundException) { + notFound = true + } else if errors.As(err, &offlineException) { + offline = true + } else { + var e *gogithub.RateLimitError + if errors.As(err, &e) { + // We log the underlying error when we failed calling GitHub API to list or unregisters, + // or the runner is still busy. + log.Error( + err, + fmt.Sprintf( + "Failed to check if runner is busy due to Github API rate limit. Retrying in %s to avoid excessive GitHub API calls", + retryDelayOnGitHubAPIRateLimitError, + ), + ) + + return ctrl.Result{RequeueAfter: retryDelayOnGitHubAPIRateLimitError}, err + } + + return ctrl.Result{}, err + } + } + + registrationTimeout := 10 * time.Minute + durationAfterRegistrationTimeout := currentTime.Sub(runnerPod.CreationTimestamp.Add(registrationTimeout)) + registrationDidTimeout := durationAfterRegistrationTimeout > 0 + + if notFound { + if registrationDidTimeout { + log.Info( + "Runner failed to register itself to GitHub in timely manner. "+ + "Recreating the pod to see if it resolves the issue. "+ + "CAUTION: If you see this a lot, you should investigate the root cause. "+ + "See https://github.com/actions-runner-controller/actions-runner-controller/issues/288", + "podCreationTimestamp", runnerPod.CreationTimestamp, + "currentTime", currentTime, + "configuredRegistrationTimeout", registrationTimeout, + ) + + restart = true + } else { + log.V(1).Info( + "Runner pod exists but we failed to check if runner is busy. Apparently it still needs more time.", + "runnerName", runnerPod.Name, + ) + } + } else if offline { + if registrationDidTimeout { + log.Info( + "Already existing GitHub runner still appears offline . "+ + "Recreating the pod to see if it resolves the issue. "+ + "CAUTION: If you see this a lot, you should investigate the root cause. ", + "podCreationTimestamp", runnerPod.CreationTimestamp, + "currentTime", currentTime, + "configuredRegistrationTimeout", registrationTimeout, + ) + + restart = true + } else { + log.V(1).Info( + "Runner pod exists but the GitHub runner appears to be still offline. Waiting for runner to get online ...", + "runnerName", runnerPod.Name, + ) + } + } + + if (notFound || offline) && !registrationDidTimeout { + registrationRecheckJitter := 10 * time.Second + if r.RegistrationRecheckJitter > 0 { + registrationRecheckJitter = r.RegistrationRecheckJitter + } + + registrationRecheckDelay = registrationCheckInterval + wait.Jitter(registrationRecheckJitter, 0.1) + } + } + + // Don't do anything if there's no need to restart the runner + if !restart { + // This guard enables us to update runner.Status.Phase to `Running` only after + // the runner is registered to GitHub. + if registrationRecheckDelay > 0 { + log.V(1).Info(fmt.Sprintf("Rechecking the runner registration in %s", registrationRecheckDelay)) + + updated := runnerPod.DeepCopy() + t := time.Now().Format(time.RFC3339) + updated.Annotations[AnnotationKeyLastRegistrationCheckTime] = t + + if err := r.Patch(ctx, updated, client.MergeFrom(&runnerPod)); err != nil { + log.Error(err, "Failed to update runner pod annotation for LastRegistrationCheckTime") + return ctrl.Result{}, err + } + + return ctrl.Result{RequeueAfter: registrationRecheckDelay}, nil + } + + // Seeing this message, you can expect the runner to become `Running` soon. + log.Info( + "Runner appears to have registered and running.", + "podCreationTimestamp", runnerPod.CreationTimestamp, + ) + + return ctrl.Result{}, nil + } + + // Delete current pod if recreation is needed + if err := r.Delete(ctx, &runnerPod); err != nil { + log.Error(err, "Failed to delete pod resource") + return ctrl.Result{}, err + } + + r.Recorder.Event(&runnerPod, corev1.EventTypeNormal, "PodDeleted", fmt.Sprintf("Deleted pod '%s'", runnerPod.Name)) + log.Info("Deleted runner pod", "name", runnerPod.Name) + + return ctrl.Result{}, nil +} + +func (r *RunnerPodReconciler) unregisterRunner(ctx context.Context, enterprise, org, repo, name string) (bool, error) { + runners, err := r.GitHubClient.ListRunners(ctx, enterprise, org, repo) + if err != nil { + return false, err + } + + var busy bool + + id := int64(0) + for _, runner := range runners { + if runner.GetName() == name { + // Sometimes a runner can stuck "busy" even though it is already "offline". + // Thus removing the condition on status can block the runner pod from being terminated forever. + busy = runner.GetBusy() + if runner.GetStatus() != "offline" && busy { + r.Log.Info("This runner will delay the runner pod deletion and the runner deregistration until it becomes either offline or non-busy", "name", runner.GetName(), "status", runner.GetStatus(), "busy", runner.GetBusy()) + return false, fmt.Errorf("runner is busy") + } + id = runner.GetID() + break + } + } + + if id == int64(0) { + return false, nil + } + + // Sometimes a runner can stuck "busy" even though it is already "offline". + // Trying to remove the offline but busy runner can result in errors like the following: + // failed to remove runner: DELETE https://api.github.com/repos/actions-runner-controller/mumoshu-actions-test/actions/runners/47: 422 Bad request - Runner \"example-runnerset-0\" is still running a job\" [] + if !busy { + if err := r.GitHubClient.RemoveRunner(ctx, enterprise, org, repo, id); err != nil { + return false, err + } + } + + return true, nil +} + +func (r *RunnerPodReconciler) SetupWithManager(mgr ctrl.Manager) error { + name := "runnerpod-controller" + if r.Name != "" { + name = r.Name + } + + r.Recorder = mgr.GetEventRecorderFor(name) + + return ctrl.NewControllerManagedBy(mgr). + For(&corev1.Pod{}). + Named(name). + Complete(r) +} diff --git a/main.go b/main.go index 1401c1991d..cec8f5a355 100644 --- a/main.go +++ b/main.go @@ -215,6 +215,18 @@ func main() { CacheDuration: gitHubAPICacheDuration, } + runnerPodReconciler := &controllers.RunnerPodReconciler{ + Client: mgr.GetClient(), + Log: log.WithName("runnerpod"), + Scheme: mgr.GetScheme(), + GitHubClient: ghClient, + } + + if err = runnerPodReconciler.SetupWithManager(mgr); err != nil { + log.Error(err, "unable to create controller", "controller", "RunnerPod") + os.Exit(1) + } + if err = horizontalRunnerAutoscaler.SetupWithManager(mgr); err != nil { log.Error(err, "unable to create controller", "controller", "HorizontalRunnerAutoscaler") os.Exit(1)