From f73a27729acf3bc2cce2479220e1460eb6f73efb Mon Sep 17 00:00:00 2001
From: Yuvaraj Kakaraparthi
Date: Wed, 23 Feb 2022 08:49:54 -0800
Subject: [PATCH] prevent blocking of KCP and DockerMachine controllers

KCP and DockerMachine controllers are currently blocked if the cluster
infrastructure is not ready. This leads to cases where the cluster cannot be
deleted while its infrastructure is not ready. This PR fixes this behavior and
prevents blocking the controllers in this case.
---
 .../kubeadm/internal/controllers/controller.go     | 12 ++++++------
 .../internal/controllers/controller_test.go        |  6 +++++-
 .../kubeadm/internal/controllers/scale_test.go     |  1 +
 .../kubeadm/internal/controllers/upgrade_test.go   |  1 +
 .../developer/providers/machine-infrastructure.md  |  2 ++
 docs/book/src/developer/providers/v1.1-to-v1.2.md  |  5 +++++
 test/framework/controlplane_helpers.go             |  4 ++--
 .../controllers/dockermachine_controller.go        | 14 +++++++-------
 8 files changed, 29 insertions(+), 16 deletions(-)

diff --git a/controlplane/kubeadm/internal/controllers/controller.go b/controlplane/kubeadm/internal/controllers/controller.go
index fd9b5ad93175..081c57de3026 100644
--- a/controlplane/kubeadm/internal/controllers/controller.go
+++ b/controlplane/kubeadm/internal/controllers/controller.go
@@ -149,12 +149,6 @@ func (r *KubeadmControlPlaneReconciler) Reconcile(ctx context.Context, req ctrl.
 		return ctrl.Result{}, nil
 	}
 
-	// Wait for the cluster infrastructure to be ready before creating machines
-	if !cluster.Status.InfrastructureReady {
-		log.Info("Cluster infrastructure is not ready yet")
-		return ctrl.Result{}, nil
-	}
-
 	// Initialize the patch helper.
 	patchHelper, err := patch.NewHelper(kcp, r.Client)
 	if err != nil {
@@ -255,6 +249,12 @@ func (r *KubeadmControlPlaneReconciler) reconcile(ctx context.Context, cluster *
 		return ctrl.Result{}, err
 	}
 
+	// Wait for the cluster infrastructure to be ready before creating machines
+	if !cluster.Status.InfrastructureReady {
+		log.Info("Cluster infrastructure is not ready yet")
+		return ctrl.Result{}, nil
+	}
+
 	// Generate Cluster Certificates if needed
 	config := kcp.Spec.KubeadmConfigSpec.DeepCopy()
 	config.JoinConfiguration = nil
diff --git a/controlplane/kubeadm/internal/controllers/controller_test.go b/controlplane/kubeadm/internal/controllers/controller_test.go
index 011fef2775eb..6c8f9dade06b 100644
--- a/controlplane/kubeadm/internal/controllers/controller_test.go
+++ b/controlplane/kubeadm/internal/controllers/controller_test.go
@@ -147,7 +147,7 @@ func TestReconcileReturnErrorWhenOwnerClusterIsMissing(t *testing.T) {
 	g.Expect(result).To(Equal(ctrl.Result{}))
 
 	// calling reconcile should return error
-	g.Expect(env.Delete(ctx, cluster)).To(Succeed())
+	g.Expect(env.CleanupAndWait(ctx, cluster)).To(Succeed())
 
 	g.Eventually(func() error {
 		_, err := r.Reconcile(ctx, ctrl.Request{NamespacedName: util.ObjectKey(kcp)})
@@ -461,6 +461,7 @@ func TestKubeadmControlPlaneReconciler_adoption(t *testing.T) {
 		cluster, kcp, tmpl := createClusterWithControlPlane(metav1.NamespaceDefault)
 		cluster.Spec.ControlPlaneEndpoint.Host = "bar"
 		cluster.Spec.ControlPlaneEndpoint.Port = 6443
+		cluster.Status.InfrastructureReady = true
 		kcp.Spec.Version = version
 
 		fmc := &fakeManagementCluster{
@@ -525,6 +526,7 @@ func TestKubeadmControlPlaneReconciler_adoption(t *testing.T) {
 		cluster, kcp, tmpl := createClusterWithControlPlane(metav1.NamespaceDefault)
 		cluster.Spec.ControlPlaneEndpoint.Host = "bar"
 		cluster.Spec.ControlPlaneEndpoint.Port = 6443
+		cluster.Status.InfrastructureReady = true
 		kcp.Spec.Version = version
 
 		fmc := &fakeManagementCluster{
@@ -631,6 +633,7 @@ func TestKubeadmControlPlaneReconciler_adoption(t *testing.T) {
 		cluster, kcp, tmpl := createClusterWithControlPlane(metav1.NamespaceDefault)
 		cluster.Spec.ControlPlaneEndpoint.Host = "nodomain.example.com1"
 		cluster.Spec.ControlPlaneEndpoint.Port = 6443
+		cluster.Status.InfrastructureReady = true
 		kcp.Spec.Version = version
 
 		now := metav1.Now()
@@ -697,6 +700,7 @@ func TestKubeadmControlPlaneReconciler_adoption(t *testing.T) {
 		cluster, kcp, tmpl := createClusterWithControlPlane(metav1.NamespaceDefault)
 		cluster.Spec.ControlPlaneEndpoint.Host = "nodomain.example.com2"
 		cluster.Spec.ControlPlaneEndpoint.Port = 6443
+		cluster.Status.InfrastructureReady = true
 		kcp.Spec.Version = "v1.17.0"
 
 		fmc := &fakeManagementCluster{
diff --git a/controlplane/kubeadm/internal/controllers/scale_test.go b/controlplane/kubeadm/internal/controllers/scale_test.go
index bc81bc9d8680..0af849fd1fc1 100644
--- a/controlplane/kubeadm/internal/controllers/scale_test.go
+++ b/controlplane/kubeadm/internal/controllers/scale_test.go
@@ -129,6 +129,7 @@ func TestKubeadmControlPlaneReconciler_scaleUpControlPlane(t *testing.T) {
 	initObjs := []client.Object{fakeGenericMachineTemplateCRD, cluster.DeepCopy(), kcp.DeepCopy(), genericMachineTemplate.DeepCopy()}
 	cluster.Spec.ControlPlaneEndpoint.Host = "nodomain.example.com"
 	cluster.Spec.ControlPlaneEndpoint.Port = 6443
+	cluster.Status.InfrastructureReady = true
 
 	beforeMachines := collections.New()
 	for i := 0; i < 2; i++ {
diff --git a/controlplane/kubeadm/internal/controllers/upgrade_test.go b/controlplane/kubeadm/internal/controllers/upgrade_test.go
index 4f78f0247941..a5fe75d7f0ee 100644
--- a/controlplane/kubeadm/internal/controllers/upgrade_test.go
+++ b/controlplane/kubeadm/internal/controllers/upgrade_test.go
@@ -44,6 +44,7 @@ func TestKubeadmControlPlaneReconciler_RolloutStrategy_ScaleUp(t *testing.T) {
 	cluster, kcp, genericMachineTemplate := createClusterWithControlPlane(metav1.NamespaceDefault)
 	cluster.Spec.ControlPlaneEndpoint.Host = Host
 	cluster.Spec.ControlPlaneEndpoint.Port = 6443
+	cluster.Status.InfrastructureReady = true
 	kcp.Spec.KubeadmConfigSpec.ClusterConfiguration = nil
 	kcp.Spec.Replicas = pointer.Int32Ptr(1)
 	setKCPHealthy(kcp)
diff --git a/docs/book/src/developer/providers/machine-infrastructure.md b/docs/book/src/developer/providers/machine-infrastructure.md
index 4864639f4e1b..76bb8d8bf017 100644
--- a/docs/book/src/developer/providers/machine-infrastructure.md
+++ b/docs/book/src/developer/providers/machine-infrastructure.md
@@ -113,6 +113,8 @@ The following diagram shows the typical logic for a machine infrastructure provi
 1. If the `Cluster` to which this resource belongs cannot be found, exit the reconciliation
 1. Add the provider-specific finalizer, if needed
 1. If the associated `Cluster`'s `status.infrastructureReady` is `false`, exit the reconciliation
+    1. **Note**: This check should not block any further delete reconciliation flows.
+    1. **Note**: This check should only be performed after appropriate owner references (if any) are updated.
 1. If the associated `Machine`'s `spec.bootstrap.dataSecretName` is `nil`, exit the reconciliation
 1. Reconcile provider-specific machine infrastructure
     1. If any errors are encountered:
diff --git a/docs/book/src/developer/providers/v1.1-to-v1.2.md b/docs/book/src/developer/providers/v1.1-to-v1.2.md
index 16d1a52193b2..00476ef33401 100644
--- a/docs/book/src/developer/providers/v1.1-to-v1.2.md
+++ b/docs/book/src/developer/providers/v1.1-to-v1.2.md
@@ -81,3 +81,8 @@ in ClusterAPI are kept in sync with the versions used by `sigs.k8s.io/controller
   - `WaitForControlPlaneAndMachinesReady`
   - `DiscoveryAndWaitForMachineDeployments`
 - The `AssertControlPlaneFailureDomains` function in the E2E test framework has been modified to allow proper failure domain testing.
+
+- After investigating an [issue](https://github.com/kubernetes-sigs/cluster-api/issues/6006) we discovered that an improper implementation of the check on `cluster.status.infrastructureReady` can lead to problems during cluster deletion. As a consequence, we recommend that all providers ensure:
+  - The check for `cluster.status.infrastructureReady=true`, which usually exists at the beginning of the reconcile loop for control-plane providers, is implemented after setting external object references;
+  - The check for `cluster.status.infrastructureReady=true`, which usually exists at the beginning of the reconcile loop for infrastructure providers, does not prevent the object from being deleted.
+ref. [PR #6183](https://github.com/kubernetes-sigs/cluster-api/pull/6183)
diff --git a/test/framework/controlplane_helpers.go b/test/framework/controlplane_helpers.go
index fe5b9ea74fd8..af4e62687d3d 100644
--- a/test/framework/controlplane_helpers.go
+++ b/test/framework/controlplane_helpers.go
@@ -401,12 +401,12 @@ func ScaleAndWaitControlPlane(ctx context.Context, input ScaleAndWaitControlPlan
 			return -1, err
 		}
 
-		selectorMap, err := metav1.LabelSelectorAsMap(kcpLabelSelector)
+		selector, err := metav1.LabelSelectorAsSelector(kcpLabelSelector)
 		if err != nil {
 			return -1, err
 		}
 		machines := &clusterv1.MachineList{}
-		if err := input.ClusterProxy.GetClient().List(ctx, machines, client.InNamespace(input.ControlPlane.Namespace), client.MatchingLabels(selectorMap)); err != nil {
+		if err := input.ClusterProxy.GetClient().List(ctx, machines, &client.ListOptions{LabelSelector: selector, Namespace: input.ControlPlane.Namespace}); err != nil {
 			return -1, err
 		}
 		nodeRefCount := 0
diff --git a/test/infrastructure/docker/internal/controllers/dockermachine_controller.go b/test/infrastructure/docker/internal/controllers/dockermachine_controller.go
index 2be1bd5c49a6..fd0204b55a6a 100644
--- a/test/infrastructure/docker/internal/controllers/dockermachine_controller.go
+++ b/test/infrastructure/docker/internal/controllers/dockermachine_controller.go
@@ -135,13 +135,6 @@ func (r *DockerMachineReconciler) Reconcile(ctx context.Context, req ctrl.Reques
 		return ctrl.Result{}, nil
 	}
 
-	// Check if the infrastructure is ready, otherwise return and wait for the cluster object to be updated
-	if !cluster.Status.InfrastructureReady {
-		log.Info("Waiting for DockerCluster Controller to create cluster infrastructure")
-		conditions.MarkFalse(dockerMachine, infrav1.ContainerProvisionedCondition, infrav1.WaitingForClusterInfrastructureReason, clusterv1.ConditionSeverityInfo, "")
-		return ctrl.Result{}, nil
-	}
-
 	// Create a helper for managing the docker container hosting the machine.
 	externalMachine, err := docker.NewMachine(ctx, cluster, machine.Name, nil)
 	if err != nil {
@@ -192,6 +185,13 @@ func patchDockerMachine(ctx context.Context, patchHelper *patch.Helper, dockerMa
 func (r *DockerMachineReconciler) reconcileNormal(ctx context.Context, cluster *clusterv1.Cluster, machine *clusterv1.Machine, dockerMachine *infrav1.DockerMachine, externalMachine *docker.Machine, externalLoadBalancer *docker.LoadBalancer) (res ctrl.Result, retErr error) {
 	log := ctrl.LoggerFrom(ctx)
 
+	// Check if the infrastructure is ready, otherwise return and wait for the cluster object to be updated
+	if !cluster.Status.InfrastructureReady {
+		log.Info("Waiting for DockerCluster Controller to create cluster infrastructure")
+		conditions.MarkFalse(dockerMachine, infrav1.ContainerProvisionedCondition, infrav1.WaitingForClusterInfrastructureReason, clusterv1.ConditionSeverityInfo, "")
+		return ctrl.Result{}, nil
+	}
+
 	// if the machine is already provisioned, return
 	if dockerMachine.Spec.ProviderID != nil {
 		// ensure ready state is set.
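
For reference, the provider guidance above translates to the following shape for an infrastructure provider's reconcile loop. This is only a minimal sketch and not part of the patch: the `FooMachine` type, the `FooMachineReconciler`, its `reconcileNormal`/`reconcileDelete` helpers, and the `example.com/foo-provider` import path are hypothetical placeholders; only the placement of the `cluster.Status.InfrastructureReady` check (after owner resolution and deletion handling, inside the normal-reconcile path) mirrors what this change does for KCP and the DockerMachine controller.

```go
package controllers

import (
	"context"

	clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
	"sigs.k8s.io/cluster-api/util"
	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/client"

	// Hypothetical provider API package; stands in for the provider's own types.
	infrav1 "example.com/foo-provider/api/v1beta1"
)

// FooMachineReconciler is a stand-in for a provider's machine infrastructure reconciler.
type FooMachineReconciler struct {
	Client client.Client
}

func (r *FooMachineReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
	// Fetch the provider machine object.
	fooMachine := &infrav1.FooMachine{}
	if err := r.Client.Get(ctx, req.NamespacedName, fooMachine); err != nil {
		return ctrl.Result{}, client.IgnoreNotFound(err)
	}

	// Resolve the owning Machine and Cluster before any readiness gating.
	machine, err := util.GetOwnerMachine(ctx, r.Client, fooMachine.ObjectMeta)
	if err != nil || machine == nil {
		return ctrl.Result{}, err
	}
	cluster, err := util.GetClusterFromMetadata(ctx, r.Client, machine.ObjectMeta)
	if err != nil {
		return ctrl.Result{}, err
	}

	// Deletion is handled before any InfrastructureReady check, so a cluster
	// whose infrastructure never became ready can still be torn down.
	if !fooMachine.ObjectMeta.DeletionTimestamp.IsZero() {
		return r.reconcileDelete(ctx, cluster, machine, fooMachine)
	}
	return r.reconcileNormal(ctx, cluster, machine, fooMachine)
}

func (r *FooMachineReconciler) reconcileNormal(ctx context.Context, cluster *clusterv1.Cluster, machine *clusterv1.Machine, fooMachine *infrav1.FooMachine) (ctrl.Result, error) {
	log := ctrl.LoggerFrom(ctx)

	// The readiness gate lives only on the create/update path, so it can no
	// longer block deletion.
	if !cluster.Status.InfrastructureReady {
		log.Info("Waiting for the cluster infrastructure to be ready")
		return ctrl.Result{}, nil
	}

	// ... provider-specific provisioning of the machine infrastructure ...
	return ctrl.Result{}, nil
}

func (r *FooMachineReconciler) reconcileDelete(ctx context.Context, cluster *clusterv1.Cluster, machine *clusterv1.Machine, fooMachine *infrav1.FooMachine) (ctrl.Result, error) {
	// ... provider-specific teardown, then remove the finalizer ...
	return ctrl.Result{}, nil
}
```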