Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

🌱 Fix self-hosted flakes in E2E tests #3639

Merged
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 27 additions & 3 deletions test/e2e/self_hosted.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ import (

. "github.com/onsi/ginkgo"
. "github.com/onsi/gomega"
"sigs.k8s.io/controller-runtime/pkg/client"

corev1 "k8s.io/api/core/v1"
"k8s.io/utils/pointer"
Expand Down Expand Up @@ -96,7 +97,6 @@ func SelfHostedSpec(ctx context.Context, inputGetter func() SelfHostedSpecInput)
})

By("Turning the workload cluster into a management cluster")
//TODO: refactor into an helper func e.g. "UpgradeToManagementCluster"

// In case the cluster is a DockerCluster, we should load controller images into the nodes.
// Nb. this can be achieved also by changing the DockerMachine spec, but for the time being we are using
Expand Down Expand Up @@ -127,7 +127,19 @@ func SelfHostedSpec(ctx context.Context, inputGetter func() SelfHostedSpecInput)
LogFolder: filepath.Join(input.ArtifactFolder, "clusters", cluster.Name),
}, input.E2EConfig.GetIntervals(specName, "wait-controllers")...)

//TODO: refactor in to an helper func e.g. "MoveToSelfHostedAndWait"
By("Ensure API servers are stable before doing move")
// Nb. This check was introduced to prevent doing move to self-hosted in an aggressive way and thus avoid flakes.
// More specifically, we were observing the test failing to get objects from the API server during move, so we
// are now testing the API servers are stable before starting move.
Consistently(func() error {
kubeSystem := &corev1.Namespace{}
return input.BootstrapClusterProxy.GetClient().Get(ctx, client.ObjectKey{Name: "kube-system"}, kubeSystem)
}, "5s", "100ms").Should(BeNil(), "Failed to assert bootstrap API server stability")
Consistently(func() error {
kubeSystem := &corev1.Namespace{}
return selfHostedClusterProxy.GetClient().Get(ctx, client.ObjectKey{Name: "kube-system"}, kubeSystem)
}, "5s", "100ms").Should(BeNil(), "Failed to assert self-hosted API server stability")

By("Moving the cluster to self hosted")
clusterctl.Move(context.TODO(), clusterctl.MoveInput{
LogFolder: filepath.Join(input.ArtifactFolder, "clusters", "bootstrap"),
Expand Down Expand Up @@ -155,7 +167,6 @@ func SelfHostedSpec(ctx context.Context, inputGetter func() SelfHostedSpecInput)
})

AfterEach(func() {
//TODO: refactor in to an helper func e.g. "MoveToBootstrapAndWait"
if selfHostedNamespace != nil {
// Dump all Cluster API related resources to artifacts before pivoting back.
framework.DumpAllResources(ctx, framework.DumpAllResourcesInput{
Expand All @@ -165,6 +176,19 @@ func SelfHostedSpec(ctx context.Context, inputGetter func() SelfHostedSpecInput)
})
}
if selfHostedCluster != nil {
By("Ensure API servers are stable before doing move")
// Nb. This check was introduced to prevent doing move back to bootstrap in an aggressive way and thus avoid flakes.
// More specifically, we were observing the test failing to get objects from the API server during move, so we
// are now testing the API servers are stable before starting move.
Consistently(func() error {
kubeSystem := &corev1.Namespace{}
return input.BootstrapClusterProxy.GetClient().Get(ctx, client.ObjectKey{Name: "kube-system"}, kubeSystem)
}, "5s", "100ms").Should(BeNil(), "Failed to assert bootstrap API server stability")
Consistently(func() error {
kubeSystem := &corev1.Namespace{}
return selfHostedClusterProxy.GetClient().Get(ctx, client.ObjectKey{Name: "kube-system"}, kubeSystem)
}, "5s", "100ms").Should(BeNil(), "Failed to assert self-hosted API server stability")

By("Moving the cluster back to bootstrap")
clusterctl.Move(ctx, clusterctl.MoveInput{
LogFolder: filepath.Join(input.ArtifactFolder, "clusters", cluster.Name),
Expand Down