prevent blocking of KCP and DockerMachine controllers
The KCP and DockerMachine controllers are currently blocked if the cluster
infrastructure is not ready. This leads to cases where we cannot delete the
cluster if the infrastructure never becomes ready. This PR fixes this behavior
and prevents the controllers from blocking in this case.
Yuvaraj Kakaraparthi committed Mar 7, 2022
1 parent 0195739 commit f73a277
Showing 8 changed files with 29 additions and 16 deletions.
12 changes: 6 additions & 6 deletions controlplane/kubeadm/internal/controllers/controller.go
@@ -149,12 +149,6 @@ func (r *KubeadmControlPlaneReconciler) Reconcile(ctx context.Context, req ctrl.
return ctrl.Result{}, nil
}

// Wait for the cluster infrastructure to be ready before creating machines
if !cluster.Status.InfrastructureReady {
log.Info("Cluster infrastructure is not ready yet")
return ctrl.Result{}, nil
}

// Initialize the patch helper.
patchHelper, err := patch.NewHelper(kcp, r.Client)
if err != nil {
@@ -255,6 +249,12 @@ func (r *KubeadmControlPlaneReconciler) reconcile(ctx context.Context, cluster *
return ctrl.Result{}, err
}

// Wait for the cluster infrastructure to be ready before creating machines
if !cluster.Status.InfrastructureReady {
log.Info("Cluster infrastructure is not ready yet")
return ctrl.Result{}, nil
}

// Generate Cluster Certificates if needed
config := kcp.Spec.KubeadmConfigSpec.DeepCopy()
config.JoinConfiguration = nil
6 changes: 5 additions & 1 deletion controlplane/kubeadm/internal/controllers/controller_test.go
@@ -147,7 +147,7 @@ func TestReconcileReturnErrorWhenOwnerClusterIsMissing(t *testing.T) {
g.Expect(result).To(Equal(ctrl.Result{}))

// calling reconcile should return error
g.Expect(env.Delete(ctx, cluster)).To(Succeed())
g.Expect(env.CleanupAndWait(ctx, cluster)).To(Succeed())

g.Eventually(func() error {
_, err := r.Reconcile(ctx, ctrl.Request{NamespacedName: util.ObjectKey(kcp)})
@@ -461,6 +461,7 @@ func TestKubeadmControlPlaneReconciler_adoption(t *testing.T) {
cluster, kcp, tmpl := createClusterWithControlPlane(metav1.NamespaceDefault)
cluster.Spec.ControlPlaneEndpoint.Host = "bar"
cluster.Spec.ControlPlaneEndpoint.Port = 6443
cluster.Status.InfrastructureReady = true
kcp.Spec.Version = version

fmc := &fakeManagementCluster{
@@ -525,6 +526,7 @@ func TestKubeadmControlPlaneReconciler_adoption(t *testing.T) {
cluster, kcp, tmpl := createClusterWithControlPlane(metav1.NamespaceDefault)
cluster.Spec.ControlPlaneEndpoint.Host = "bar"
cluster.Spec.ControlPlaneEndpoint.Port = 6443
cluster.Status.InfrastructureReady = true
kcp.Spec.Version = version

fmc := &fakeManagementCluster{
@@ -631,6 +633,7 @@ func TestKubeadmControlPlaneReconciler_adoption(t *testing.T) {
cluster, kcp, tmpl := createClusterWithControlPlane(metav1.NamespaceDefault)
cluster.Spec.ControlPlaneEndpoint.Host = "nodomain.example.com1"
cluster.Spec.ControlPlaneEndpoint.Port = 6443
cluster.Status.InfrastructureReady = true
kcp.Spec.Version = version

now := metav1.Now()
@@ -697,6 +700,7 @@ func TestKubeadmControlPlaneReconciler_adoption(t *testing.T) {
cluster, kcp, tmpl := createClusterWithControlPlane(metav1.NamespaceDefault)
cluster.Spec.ControlPlaneEndpoint.Host = "nodomain.example.com2"
cluster.Spec.ControlPlaneEndpoint.Port = 6443
cluster.Status.InfrastructureReady = true
kcp.Spec.Version = "v1.17.0"

fmc := &fakeManagementCluster{
1 change: 1 addition & 0 deletions controlplane/kubeadm/internal/controllers/scale_test.go
@@ -129,6 +129,7 @@ func TestKubeadmControlPlaneReconciler_scaleUpControlPlane(t *testing.T) {
initObjs := []client.Object{fakeGenericMachineTemplateCRD, cluster.DeepCopy(), kcp.DeepCopy(), genericMachineTemplate.DeepCopy()}
cluster.Spec.ControlPlaneEndpoint.Host = "nodomain.example.com"
cluster.Spec.ControlPlaneEndpoint.Port = 6443
cluster.Status.InfrastructureReady = true

beforeMachines := collections.New()
for i := 0; i < 2; i++ {
1 change: 1 addition & 0 deletions controlplane/kubeadm/internal/controllers/upgrade_test.go
@@ -44,6 +44,7 @@ func TestKubeadmControlPlaneReconciler_RolloutStrategy_ScaleUp(t *testing.T) {
cluster, kcp, genericMachineTemplate := createClusterWithControlPlane(metav1.NamespaceDefault)
cluster.Spec.ControlPlaneEndpoint.Host = Host
cluster.Spec.ControlPlaneEndpoint.Port = 6443
cluster.Status.InfrastructureReady = true
kcp.Spec.KubeadmConfigSpec.ClusterConfiguration = nil
kcp.Spec.Replicas = pointer.Int32Ptr(1)
setKCPHealthy(kcp)
2 changes: 2 additions & 0 deletions docs/book/src/developer/providers/machine-infrastructure.md
@@ -113,6 +113,8 @@ The following diagram shows the typical logic for a machine infrastructure provi
1. If the `Cluster` to which this resource belongs cannot be found, exit the reconciliation
1. Add the provider-specific finalizer, if needed
1. If the associated `Cluster`'s `status.infrastructureReady` is `false`, exit the reconciliation
1. **Note**: This check should not block any further delete reconciliation flows.
1. **Note**: This check should only be performed after the appropriate owner references (if any) have been updated (a sketch of this ordering follows the diff).
1. If the associated `Machine`'s `spec.bootstrap.dataSecretName` is `nil`, exit the reconciliation
1. Reconcile provider-specific machine infrastructure
1. If any errors are encountered:
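The two notes above recommend a specific ordering: owner references first, then the infrastructure-readiness gate, and never on the deletion path. Below is a minimal, self-contained sketch of that ordering. It uses simplified stand-in types instead of the real cluster-api/controller-runtime API, and the finalizer name is made up for illustration.

```go
package main

import "fmt"

// Simplified stand-ins for clusterv1.Cluster, clusterv1.Machine and a
// provider's InfraMachine CRD; a real controller returns ctrl.Result and
// uses patch helpers, conditions, etc.
type Cluster struct{ InfrastructureReady bool }

type Machine struct{ BootstrapDataSecretName *string }

type InfraMachine struct {
	Finalizers []string
	Deleting   bool
}

func reconcileDelete(im *InfraMachine) {
	// Deletion runs before the readiness gate, so a cluster whose
	// infrastructure never became ready can still be cleaned up.
	fmt.Println("deleting provider-specific machine infrastructure")
}

func reconcileNormal(cluster *Cluster, machine *Machine, im *InfraMachine) {
	// Owner references are assumed to have been reconciled before this point.
	if !cluster.InfrastructureReady {
		// A real provider would also mark a "waiting for cluster
		// infrastructure" condition here.
		fmt.Println("cluster infrastructure is not ready yet; exiting")
		return
	}
	if machine.BootstrapDataSecretName == nil {
		fmt.Println("bootstrap data is not available yet; exiting")
		return
	}
	fmt.Println("reconciling provider-specific machine infrastructure")
}

func Reconcile(cluster *Cluster, machine *Machine, im *InfraMachine) {
	// Add the provider-specific finalizer if needed ("example.infra.io/..."
	// is a made-up name).
	im.Finalizers = append(im.Finalizers, "example.infra.io/finalizer")

	if im.Deleting {
		reconcileDelete(im)
		return
	}
	reconcileNormal(cluster, machine, im)
}

func main() {
	// Even with InfrastructureReady=false, deletion is not blocked.
	Reconcile(&Cluster{}, &Machine{}, &InfraMachine{Deleting: true})
}
```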
5 changes: 5 additions & 0 deletions docs/book/src/developer/providers/v1.1-to-v1.2.md
@@ -81,3 +81,8 @@ in ClusterAPI are kept in sync with the versions used by `sigs.k8s.io/controller
- `WaitForControlPlaneAndMachinesReady`
- `DiscoveryAndWaitForMachineDeployments`
- The `AssertControlPlaneFailureDomains` function in the E2E test framework has been modified to allow proper failure domain testing.

- After investigating an [issue](https://github.com/kubernetes-sigs/cluster-api/issues/6006) we discovered that an improper implementation of the check on `cluster.status.infrastructureReady` can lead to problems during cluster deletion. As a consequence, we recommend that all providers ensure:
  - The check for `cluster.status.infrastructureReady=true`, which usually sits at the beginning of the reconcile loop for control-plane providers, is performed only after setting the external object references;
  - The check for `cluster.status.infrastructureReady=true`, which usually sits at the beginning of the reconcile loop for infrastructure providers, does not prevent the object from being deleted.
  See [PR #6183](https://github.com/kubernetes-sigs/cluster-api/pull/6183); a sketch of the recommended placement follows this diff.
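A minimal sketch of the recommended placement for control-plane providers, mirroring the KCP change in this commit: the gate lives inside the normal reconcile flow, runs only after owner references on externally referenced objects are set, and is never reached on the deletion path. Types and names are simplified stand-ins, not the real cluster-api API.

```go
package main

import "fmt"

// Simplified stand-ins for clusterv1.Cluster and a control-plane CRD.
type Cluster struct{ InfrastructureReady bool }

type ControlPlane struct{ InfraTemplateOwnerSet bool }

// reconcile is the normal (non-delete) flow; the top-level Reconcile is
// assumed to route deletion to a separate reconcileDelete before this is
// ever called, so the gate below cannot block deletion.
func reconcile(cluster *Cluster, cp *ControlPlane) {
	// Set owner references on externally referenced objects first (e.g. the
	// infrastructure machine template), so ownership is established even
	// while the cluster infrastructure is still provisioning.
	cp.InfraTemplateOwnerSet = true

	// Only then gate on cluster infrastructure readiness.
	if !cluster.InfrastructureReady {
		fmt.Println("cluster infrastructure is not ready yet; exiting")
		return
	}

	// Proceed with certificates, machine creation, scaling, and so on.
	fmt.Println("reconciling control plane machines")
}

func main() {
	reconcile(&Cluster{}, &ControlPlane{})
}
```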
4 changes: 2 additions & 2 deletions test/framework/controlplane_helpers.go
@@ -401,12 +401,12 @@ func ScaleAndWaitControlPlane(ctx context.Context, input ScaleAndWaitControlPlan
return -1, err
}

selectorMap, err := metav1.LabelSelectorAsMap(kcpLabelSelector)
selector, err := metav1.LabelSelectorAsSelector(kcpLabelSelector)
if err != nil {
return -1, err
}
machines := &clusterv1.MachineList{}
if err := input.ClusterProxy.GetClient().List(ctx, machines, client.InNamespace(input.ControlPlane.Namespace), client.MatchingLabels(selectorMap)); err != nil {
if err := input.ClusterProxy.GetClient().List(ctx, machines, &client.ListOptions{LabelSelector: selector, Namespace: input.ControlPlane.Namespace}); err != nil {
return -1, err
}
nodeRefCount := 0
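The hunk above replaces `metav1.LabelSelectorAsMap` + `client.MatchingLabels` with `metav1.LabelSelectorAsSelector` + `client.ListOptions`. A plausible reason (my reading, the diff itself does not say) is that the map form can only express `matchLabels` and returns an error for most `matchExpressions`, while a `labels.Selector` handles both. A small self-contained sketch; the label keys are illustrative only.

```go
package main

import (
	"fmt"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

func main() {
	// A selector that uses matchExpressions in addition to matchLabels.
	sel := &metav1.LabelSelector{
		MatchLabels: map[string]string{"cluster.x-k8s.io/control-plane": ""},
		MatchExpressions: []metav1.LabelSelectorRequirement{{
			Key:      "cluster.x-k8s.io/cluster-name",
			Operator: metav1.LabelSelectorOpExists,
		}},
	}

	// The map form cannot represent the Exists operator and errors out.
	if _, err := metav1.LabelSelectorAsMap(sel); err != nil {
		fmt.Println("LabelSelectorAsMap:", err)
	}

	// The labels.Selector form handles both and can be passed to a
	// controller-runtime List call via client.ListOptions{LabelSelector: s}.
	s, err := metav1.LabelSelectorAsSelector(sel)
	if err != nil {
		panic(err)
	}
	fmt.Println("selector:", s.String())
}
```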
@@ -135,13 +135,6 @@ func (r *DockerMachineReconciler) Reconcile(ctx context.Context, req ctrl.Reques
return ctrl.Result{}, nil
}

// Check if the infrastructure is ready, otherwise return and wait for the cluster object to be updated
if !cluster.Status.InfrastructureReady {
log.Info("Waiting for DockerCluster Controller to create cluster infrastructure")
conditions.MarkFalse(dockerMachine, infrav1.ContainerProvisionedCondition, infrav1.WaitingForClusterInfrastructureReason, clusterv1.ConditionSeverityInfo, "")
return ctrl.Result{}, nil
}

// Create a helper for managing the docker container hosting the machine.
externalMachine, err := docker.NewMachine(ctx, cluster, machine.Name, nil)
if err != nil {
@@ -192,6 +185,13 @@ func patchDockerMachine(ctx context.Context, patchHelper *patch.Helper, dockerMa
func (r *DockerMachineReconciler) reconcileNormal(ctx context.Context, cluster *clusterv1.Cluster, machine *clusterv1.Machine, dockerMachine *infrav1.DockerMachine, externalMachine *docker.Machine, externalLoadBalancer *docker.LoadBalancer) (res ctrl.Result, retErr error) {
log := ctrl.LoggerFrom(ctx)

// Check if the infrastructure is ready, otherwise return and wait for the cluster object to be updated
if !cluster.Status.InfrastructureReady {
log.Info("Waiting for DockerCluster Controller to create cluster infrastructure")
conditions.MarkFalse(dockerMachine, infrav1.ContainerProvisionedCondition, infrav1.WaitingForClusterInfrastructureReason, clusterv1.ConditionSeverityInfo, "")
return ctrl.Result{}, nil
}

// if the machine is already provisioned, return
if dockerMachine.Spec.ProviderID != nil {
// ensure ready state is set.
