diff --git a/pkg/rke2/spw.go b/pkg/rke2/spw.go index ff88164e40..f8636c72c3 100644 --- a/pkg/rke2/spw.go +++ b/pkg/rke2/spw.go @@ -2,6 +2,7 @@ package rke2 import ( "context" + "fmt" "os" "path/filepath" "sync" @@ -11,29 +12,12 @@ import ( "github.com/k3s-io/k3s/pkg/cli/cmds" "github.com/pkg/errors" "github.com/sirupsen/logrus" - "google.golang.org/grpc" v1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/util/wait" "k8s.io/apimachinery/pkg/util/yaml" runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1alpha2" ) -// total timeout of 4 minutes -var podCheckBackoff = wait.Backoff{ - Steps: 12, - Duration: 15 * time.Second, - Factor: 1.0, - Jitter: 0.1, -} - -// total timeout of 2047 seconds (34 minutes) -var criBackoff = wait.Backoff{ - Steps: 12, - Duration: 1 * time.Second, - Factor: 2, - Jitter: 0.1, -} - // checkStaticManifests validates that the pods started with rke2 match the static manifests // provided in /var/lib/rancher/rke2/agent/pod-manifests. When restarting rke2, it takes time // for any changes to static manifests to be pulled by kubelet. Additionally this prevents errors @@ -42,57 +26,50 @@ func checkStaticManifests(dataDir string) cmds.StartupHook { return func(ctx context.Context, wg *sync.WaitGroup, args cmds.StartupHookArgs) error { go func() { defer wg.Done() + if err := wait.PollImmediate(20*time.Second, 30*time.Minute, func() (bool, error) { - var conn *grpc.ClientConn - if err := wait.ExponentialBackoff(criBackoff, func() (done bool, err error) { - conn, err = containerdk3s.CriConnection(ctx, containerdSock) + conn, err := containerdk3s.CriConnection(ctx, containerdSock) if err != nil { logrus.Infof("Waiting for cri connection: %v", err) return false, nil } - return true, nil - }); err != nil { - logrus.Fatalf("failed to setup cri connection: %v", err) - } - cRuntime := runtimeapi.NewRuntimeServiceClient(conn) - defer conn.Close() + cRuntime := runtimeapi.NewRuntimeServiceClient(conn) + defer conn.Close() - manifestDir := podManifestsDir(dataDir) + manifestDir := podManifestsDir(dataDir) - for _, pod := range []string{"etcd", "kube-apiserver"} { - manifestFile := filepath.Join(manifestDir, pod+".yaml") - if f, err := os.Open(manifestFile); err == nil { - podManifest := v1.Pod{} - decoder := yaml.NewYAMLToJSONDecoder(f) - err = decoder.Decode(&podManifest) - if err != nil { - logrus.Fatalf("Failed to decode %s manifest: %v", pod, err) - } - podFilter := &runtimeapi.ContainerFilter{ - LabelSelector: map[string]string{ - "io.kubernetes.container.name": pod, - }, - } - if err := wait.ExponentialBackoff(podCheckBackoff, func() (done bool, err error) { + for _, pod := range []string{"etcd", "kube-apiserver"} { + manifestFile := filepath.Join(manifestDir, pod+".yaml") + if f, err := os.Open(manifestFile); err == nil { + defer f.Close() + podManifest := v1.Pod{} + decoder := yaml.NewYAMLToJSONDecoder(f) + err = decoder.Decode(&podManifest) + if err != nil { + logrus.Fatalf("Failed to decode %s manifest: %v", pod, err) + } + podFilter := &runtimeapi.ContainerFilter{ + LabelSelector: map[string]string{ + "io.kubernetes.pod.uid": string(podManifest.UID), + }, + } resp, err := cRuntime.ListContainers(ctx, &runtimeapi.ListContainersRequest{Filter: podFilter}) if err != nil { return false, err } - for _, c := range resp.Containers { - if c.Labels["io.kubernetes.pod.uid"] == string(podManifest.UID) { - logrus.Infof("Latest %s manifest deployed", pod) - return true, nil - } + if len(resp.Containers) < 1 { + logrus.Infof("%s pod not found, retrying", pod) + return false, nil } - logrus.Infof("Waiting for %s manifest", pod) - return false, nil - }); err != nil { - logrus.Fatalf("Failed to wait for latest %s manifest to be deployed: %v", pod, err) + logrus.Infof("Latest %s manifest deployed", pod) + } else if !errors.Is(err, os.ErrNotExist) { + // Since split-role servers exist, we don't care if no manifest is found + return false, fmt.Errorf("failed to open %s manifest: %v", pod, err) } - } else if !errors.Is(err, os.ErrNotExist) { - // Since split-role servers exist, we don't care if no manifest is found - logrus.Fatalf("Failed to open %s manifest: %v", pod, err) } + return true, nil + }); err != nil { + logrus.Fatalf("Failed waiting for manifests to deploy: %v", err) } }() return nil