Skip to content

Commit

Permalink
[Release-1.23] Consolidate staticPod timeout to static 30 minutes
Browse files Browse the repository at this point in the history
* Consolidate staticPod timeout to static 30 minutes
* Close files and simplify filter
* Convert to PollImmediate
Signed-off-by: Derek Nola <[email protected]>
  • Loading branch information
dereknola authored Jul 19, 2022
1 parent ba9ef87 commit 2d206eb
Showing 1 changed file with 31 additions and 54 deletions.
85 changes: 31 additions & 54 deletions pkg/rke2/spw.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package rke2

import (
"context"
"fmt"
"os"
"path/filepath"
"sync"
Expand All @@ -11,29 +12,12 @@ import (
"github.com/k3s-io/k3s/pkg/cli/cmds"
"github.com/pkg/errors"
"github.com/sirupsen/logrus"
"google.golang.org/grpc"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/apimachinery/pkg/util/yaml"
runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1alpha2"
)

// total timeout of about 3 minutes (12 steps x 15 seconds, before jitter)
var podCheckBackoff = wait.Backoff{
Steps: 12,
Duration: 15 * time.Second,
Factor: 1.0,
Jitter: 0.1,
}

// total timeout of 2047 seconds (34 minutes)
var criBackoff = wait.Backoff{
Steps: 12,
Duration: 1 * time.Second,
Factor: 2,
Jitter: 0.1,
}

// checkStaticManifests validates that the pods started with rke2 match the static manifests
// provided in /var/lib/rancher/rke2/agent/pod-manifests. When restarting rke2, it takes time
// for any changes to static manifests to be pulled by kubelet. Additionally this prevents errors
Expand All @@ -42,57 +26,50 @@ func checkStaticManifests(dataDir string) cmds.StartupHook {
return func(ctx context.Context, wg *sync.WaitGroup, args cmds.StartupHookArgs) error {
go func() {
defer wg.Done()
if err := wait.PollImmediate(20*time.Second, 30*time.Minute, func() (bool, error) {

var conn *grpc.ClientConn
if err := wait.ExponentialBackoff(criBackoff, func() (done bool, err error) {
conn, err = containerdk3s.CriConnection(ctx, containerdSock)
conn, err := containerdk3s.CriConnection(ctx, containerdSock)
if err != nil {
logrus.Infof("Waiting for cri connection: %v", err)
return false, nil
}
return true, nil
}); err != nil {
logrus.Fatalf("failed to setup cri connection: %v", err)
}
cRuntime := runtimeapi.NewRuntimeServiceClient(conn)
defer conn.Close()
cRuntime := runtimeapi.NewRuntimeServiceClient(conn)
defer conn.Close()

manifestDir := podManifestsDir(dataDir)
manifestDir := podManifestsDir(dataDir)

for _, pod := range []string{"etcd", "kube-apiserver"} {
manifestFile := filepath.Join(manifestDir, pod+".yaml")
if f, err := os.Open(manifestFile); err == nil {
podManifest := v1.Pod{}
decoder := yaml.NewYAMLToJSONDecoder(f)
err = decoder.Decode(&podManifest)
if err != nil {
logrus.Fatalf("Failed to decode %s manifest: %v", pod, err)
}
podFilter := &runtimeapi.ContainerFilter{
LabelSelector: map[string]string{
"io.kubernetes.container.name": pod,
},
}
if err := wait.ExponentialBackoff(podCheckBackoff, func() (done bool, err error) {
for _, pod := range []string{"etcd", "kube-apiserver"} {
manifestFile := filepath.Join(manifestDir, pod+".yaml")
if f, err := os.Open(manifestFile); err == nil {
defer f.Close()
podManifest := v1.Pod{}
decoder := yaml.NewYAMLToJSONDecoder(f)
err = decoder.Decode(&podManifest)
if err != nil {
logrus.Fatalf("Failed to decode %s manifest: %v", pod, err)
}
podFilter := &runtimeapi.ContainerFilter{
LabelSelector: map[string]string{
"io.kubernetes.pod.uid": string(podManifest.UID),
},
}
resp, err := cRuntime.ListContainers(ctx, &runtimeapi.ListContainersRequest{Filter: podFilter})
if err != nil {
return false, err
}
for _, c := range resp.Containers {
if c.Labels["io.kubernetes.pod.uid"] == string(podManifest.UID) {
logrus.Infof("Latest %s manifest deployed", pod)
return true, nil
}
if len(resp.Containers) < 1 {
logrus.Infof("%s pod not found, retrying", pod)
return false, nil
}
logrus.Infof("Waiting for %s manifest", pod)
return false, nil
}); err != nil {
logrus.Fatalf("Failed to wait for latest %s manifest to be deployed: %v", pod, err)
logrus.Infof("Latest %s manifest deployed", pod)
} else if !errors.Is(err, os.ErrNotExist) {
// Since split-role servers exist, we don't care if no manifest is found
return false, fmt.Errorf("failed to open %s manifest: %v", pod, err)
}
} else if !errors.Is(err, os.ErrNotExist) {
// Since split-role servers exist, we don't care if no manifest is found
logrus.Fatalf("Failed to open %s manifest: %v", pod, err)
}
return true, nil
}); err != nil {
logrus.Fatalf("Failed waiting for manifests to deploy: %v", err)
}
}()
return nil
Expand Down

0 comments on commit 2d206eb

Please sign in to comment.