diff --git a/controlplane/kubeadm/api/v1alpha3/kubeadm_control_plane_types.go b/controlplane/kubeadm/api/v1alpha3/kubeadm_control_plane_types.go index 43f536e601ca..9084e60b1193 100644 --- a/controlplane/kubeadm/api/v1alpha3/kubeadm_control_plane_types.go +++ b/controlplane/kubeadm/api/v1alpha3/kubeadm_control_plane_types.go @@ -25,12 +25,8 @@ import ( ) const ( - KubeadmControlPlaneFinalizer = "kubeadm.controlplane.cluster.x-k8s.io" - KubeadmControlPlaneHashLabelKey = "kubeadm.controlplane.cluster.x-k8s.io/hash" - SelectedForUpgradeAnnotation = "kubeadm.controlplane.cluster.x-k8s.io/selected-for-upgrade" - UpgradeReplacementCreatedAnnotation = "kubeadm.controlplane.cluster.x-k8s.io/upgrade-replacement-created" - DeleteForScaleDownAnnotation = "kubeadm.controlplane.cluster.x-k8s.io/delete-for-scale-down" - ScaleDownConfigMapEntryRemovedAnnotation = "kubeadm.controlplane.cluster.x-k8s.io/scale-down-configmap-entry-removed" + KubeadmControlPlaneFinalizer = "kubeadm.controlplane.cluster.x-k8s.io" + KubeadmControlPlaneHashLabelKey = "kubeadm.controlplane.cluster.x-k8s.io/hash" ) // KubeadmControlPlaneSpec defines the desired state of KubeadmControlPlane. 
diff --git a/controlplane/kubeadm/controllers/controller.go b/controlplane/kubeadm/controllers/controller.go index 494212ef99ee..c2044e3e3807 100644 --- a/controlplane/kubeadm/controllers/controller.go +++ b/controlplane/kubeadm/controllers/controller.go @@ -214,7 +214,7 @@ func (r *KubeadmControlPlaneReconciler) reconcile(ctx context.Context, cluster * // Upgrade takes precedence over other operations if len(requireUpgrade) > 0 { logger.Info("Upgrading Control Plane") - return r.upgradeControlPlane(ctx, cluster, kcp, ownedMachines, requireUpgrade, controlPlane) + return r.upgradeControlPlane(ctx, cluster, kcp, controlPlane) } // If we've made it this far, we can assume that all ownedMachines are up to date @@ -231,11 +231,11 @@ func (r *KubeadmControlPlaneReconciler) reconcile(ctx context.Context, cluster * case numMachines < desiredReplicas && numMachines > 0: // Create a new Machine w/ join logger.Info("Scaling up control plane", "Desired", desiredReplicas, "Existing", numMachines) - return r.scaleUpControlPlane(ctx, cluster, kcp, ownedMachines, controlPlane) + return r.scaleUpControlPlane(ctx, cluster, kcp, controlPlane) // We are scaling down case numMachines > desiredReplicas: logger.Info("Scaling down control plane", "Desired", desiredReplicas, "Existing", numMachines) - return r.scaleDownControlPlane(ctx, cluster, kcp, ownedMachines, ownedMachines, controlPlane) + return r.scaleDownControlPlane(ctx, cluster, kcp, controlPlane) } // Get the workload cluster client. diff --git a/controlplane/kubeadm/controllers/controller_test.go b/controlplane/kubeadm/controllers/controller_test.go index 7b4cdf21b5c8..e97a61554610 100644 --- a/controlplane/kubeadm/controllers/controller_test.go +++ b/controlplane/kubeadm/controllers/controller_test.go @@ -19,7 +19,9 @@ package controllers import ( "context" "fmt" + "sync" "testing" + "time" . "github.com/onsi/ginkgo" . 
"github.com/onsi/gomega" @@ -920,52 +922,38 @@ func TestKubeadmControlPlaneReconciler_reconcileDelete(t *testing.T) { } -func TestKubeadmControlPlaneReconciler_scaleDownControlPlane(t *testing.T) { - g := NewWithT(t) - - machines := map[string]*clusterv1.Machine{ - "one": machine("one"), - "two": machine("two"), - "three": machine("three"), - } - fd1 := "a" - fd2 := "b" - machines["one"].Spec.FailureDomain = &fd2 - machines["two"].Spec.FailureDomain = &fd1 - machines["three"].Spec.FailureDomain = &fd2 - - r := &KubeadmControlPlaneReconciler{ - Log: log.Log, - recorder: record.NewFakeRecorder(32), - Client: newFakeClient(g, machines["one"]), - managementCluster: &fakeManagementCluster{ - EtcdHealthy: true, - ControlPlaneHealthy: true, - }, - } - cluster := newCluster(&types.NamespacedName{Name: "foo", Namespace: "default"}) - - kcp := &controlplanev1.KubeadmControlPlane{} - controlPlane := &internal.ControlPlane{ - KCP: kcp, - Cluster: cluster, - Machines: machines, - } - - ml := clusterv1.MachineList{} - ml.Items = []clusterv1.Machine{*machines["two"]} - mll := internal.NewFilterableMachineCollectionFromMachineList(&ml) - _, err := r.scaleDownControlPlane(context.Background(), cluster, kcp, machines, mll, controlPlane) - g.Expect(err).To(HaveOccurred()) -} - // test utils func newFakeClient(g *WithT, initObjs ...runtime.Object) client.Client { g.Expect(clusterv1.AddToScheme(scheme.Scheme)).To(Succeed()) g.Expect(bootstrapv1.AddToScheme(scheme.Scheme)).To(Succeed()) g.Expect(controlplanev1.AddToScheme(scheme.Scheme)).To(Succeed()) - return fake.NewFakeClientWithScheme(scheme.Scheme, initObjs...) 
+ return &fakeClient{ + startTime: time.Now(), + Client: fake.NewFakeClientWithScheme(scheme.Scheme, initObjs...), + } +} + +type fakeClient struct { + startTime time.Time + mux sync.Mutex + client.Client +} + +type fakeClientI interface { + SetCreationTimestamp(timestamp metav1.Time) +} + +// controller-runtime's fake client doesn't set a CreationTimestamp +// this sets one that increments by a minute for each object created +func (c *fakeClient) Create(ctx context.Context, obj runtime.Object, opts ...client.CreateOption) error { + if f, ok := obj.(fakeClientI); ok { + c.mux.Lock() + c.startTime = c.startTime.Add(time.Minute) + f.SetCreationTimestamp(metav1.NewTime(c.startTime)) + c.mux.Unlock() + } + return c.Client.Create(ctx, obj, opts...) } func createClusterWithControlPlane() (*clusterv1.Cluster, *controlplanev1.KubeadmControlPlane, *unstructured.Unstructured) { diff --git a/controlplane/kubeadm/controllers/helpers.go b/controlplane/kubeadm/controllers/helpers.go index ea40deca35ea..2ddfde3cf237 100644 --- a/controlplane/kubeadm/controllers/helpers.go +++ b/controlplane/kubeadm/controllers/helpers.go @@ -229,23 +229,3 @@ func (r *KubeadmControlPlaneReconciler) generateMachine(ctx context.Context, kcp } return nil } - -func (r *KubeadmControlPlaneReconciler) markWithAnnotationKey(ctx context.Context, machine *clusterv1.Machine, annotationKey string) error { - if machine == nil { - return errors.New("expected machine not nil") - } - patchHelper, err := patch.NewHelper(machine, r.Client) - if err != nil { - return errors.Wrapf(err, "failed to create patch helper for machine %s", machine.Name) - } - - if machine.Annotations == nil { - machine.Annotations = make(map[string]string) - } - machine.Annotations[annotationKey] = "" - - if err := patchHelper.Patch(ctx, machine); err != nil { - return errors.Wrapf(err, "failed to patch machine %s selected for upgrade", machine.Name) - } - return nil -} diff --git a/controlplane/kubeadm/controllers/scale.go 
b/controlplane/kubeadm/controllers/scale.go index d0ff0e081b02..cc80fe093f15 100644 --- a/controlplane/kubeadm/controllers/scale.go +++ b/controlplane/kubeadm/controllers/scale.go @@ -25,7 +25,6 @@ import ( clusterv1 "sigs.k8s.io/cluster-api/api/v1alpha3" controlplanev1 "sigs.k8s.io/cluster-api/controlplane/kubeadm/api/v1alpha3" "sigs.k8s.io/cluster-api/controlplane/kubeadm/internal" - "sigs.k8s.io/cluster-api/controlplane/kubeadm/internal/machinefilters" capierrors "sigs.k8s.io/cluster-api/errors" "sigs.k8s.io/cluster-api/util" ctrl "sigs.k8s.io/controller-runtime" @@ -46,7 +45,7 @@ func (r *KubeadmControlPlaneReconciler) initializeControlPlane(ctx context.Conte return ctrl.Result{Requeue: true}, nil } -func (r *KubeadmControlPlaneReconciler) scaleUpControlPlane(ctx context.Context, cluster *clusterv1.Cluster, kcp *controlplanev1.KubeadmControlPlane, _ internal.FilterableMachineCollection, controlPlane *internal.ControlPlane) (ctrl.Result, error) { +func (r *KubeadmControlPlaneReconciler) scaleUpControlPlane(ctx context.Context, cluster *clusterv1.Cluster, kcp *controlplanev1.KubeadmControlPlane, controlPlane *internal.ControlPlane) (ctrl.Result, error) { logger := controlPlane.Logger() if err := r.managementCluster.TargetClusterControlPlaneIsHealthy(ctx, util.ObjectKey(cluster), kcp.Name); err != nil { @@ -78,8 +77,6 @@ func (r *KubeadmControlPlaneReconciler) scaleDownControlPlane( ctx context.Context, cluster *clusterv1.Cluster, kcp *controlplanev1.KubeadmControlPlane, - ownedMachines internal.FilterableMachineCollection, - selectedMachines internal.FilterableMachineCollection, controlPlane *internal.ControlPlane, ) (ctrl.Result, error) { logger := controlPlane.Logger() @@ -97,17 +94,11 @@ func (r *KubeadmControlPlaneReconciler) scaleDownControlPlane( return ctrl.Result{}, &capierrors.RequeueAfterError{RequeueAfter: deleteRequeueAfter} } - // If there is not already a Machine that is marked for scale down, find one and mark it - markedForDeletion := 
selectedMachines.Filter(machinefilters.HasAnnotationKey(controlplanev1.DeleteForScaleDownAnnotation)) - if len(markedForDeletion) == 0 { - machineToMark, err := r.selectAndMarkMachine(ctx, selectedMachines, controlplanev1.DeleteForScaleDownAnnotation, controlPlane) - if err != nil { - return ctrl.Result{}, errors.Wrap(err, "failed to select and mark machine for scale down") - } - markedForDeletion.Insert(machineToMark) + machineToDelete, err := selectMachineForScaleDown(controlPlane) + if err != nil { + return ctrl.Result{}, errors.Wrap(err, "failed to select machine for scale down") } - machineToDelete := markedForDeletion.Oldest() if machineToDelete == nil { logger.Info("Failed to pick control plane Machine to delete") return ctrl.Result{}, errors.New("failed to pick control plane Machine to delete") @@ -121,7 +112,7 @@ func (r *KubeadmControlPlaneReconciler) scaleDownControlPlane( return ctrl.Result{}, &capierrors.RequeueAfterError{RequeueAfter: healthCheckFailedRequeueAfter} } // If etcd leadership is on machine that is about to be deleted, move it to the newest member available. 
- etcdLeaderCandidate := ownedMachines.Newest() + etcdLeaderCandidate := controlPlane.Machines.Newest() if err := workloadCluster.ForwardEtcdLeadership(ctx, machineToDelete, etcdLeaderCandidate); err != nil { logger.Error(err, "Failed to move leadership to candidate machine", "candidate", etcdLeaderCandidate.Name) return ctrl.Result{}, err @@ -131,21 +122,16 @@ func (r *KubeadmControlPlaneReconciler) scaleDownControlPlane( return ctrl.Result{}, err } - if !machinefilters.HasAnnotationKey(controlplanev1.ScaleDownConfigMapEntryRemovedAnnotation)(machineToDelete) { - if err := r.managementCluster.TargetClusterControlPlaneIsHealthy(ctx, util.ObjectKey(cluster), kcp.Name); err != nil { - logger.V(2).Info("Waiting for control plane to pass control plane health check before removing a control plane machine", "cause", err) - r.recorder.Eventf(kcp, corev1.EventTypeWarning, "ControlPlaneUnhealthy", - "Waiting for control plane to pass control plane health check before removing a control plane machine: %v", err) - return ctrl.Result{}, &capierrors.RequeueAfterError{RequeueAfter: healthCheckFailedRequeueAfter} + if err := r.managementCluster.TargetClusterControlPlaneIsHealthy(ctx, util.ObjectKey(cluster), kcp.Name); err != nil { + logger.V(2).Info("Waiting for control plane to pass control plane health check before removing a control plane machine", "cause", err) + r.recorder.Eventf(kcp, corev1.EventTypeWarning, "ControlPlaneUnhealthy", + "Waiting for control plane to pass control plane health check before removing a control plane machine: %v", err) + return ctrl.Result{}, &capierrors.RequeueAfterError{RequeueAfter: healthCheckFailedRequeueAfter} - } - if err := workloadCluster.RemoveMachineFromKubeadmConfigMap(ctx, machineToDelete); err != nil { - logger.Error(err, "Failed to remove machine from kubeadm ConfigMap") - return ctrl.Result{}, err - } - if err := r.markWithAnnotationKey(ctx, machineToDelete, controlplanev1.ScaleDownConfigMapEntryRemovedAnnotation); err != nil { - 
return ctrl.Result{}, errors.Wrapf(err, "failed to mark machine %s as having config map entry removed", machineToDelete.Name) - } + } + if err := workloadCluster.RemoveMachineFromKubeadmConfigMap(ctx, machineToDelete); err != nil { + logger.Error(err, "Failed to remove machine from kubeadm ConfigMap") + return ctrl.Result{}, err } // Do a final health check of the Control Plane components prior to actually deleting the machine @@ -166,3 +152,11 @@ func (r *KubeadmControlPlaneReconciler) scaleDownControlPlane( // Requeue the control plane, in case there are additional operations to perform return ctrl.Result{Requeue: true}, nil } + +func selectMachineForScaleDown(controlPlane *internal.ControlPlane) (*clusterv1.Machine, error) { + machines := controlPlane.Machines + if needingUpgrade := controlPlane.MachinesNeedingUpgrade(); needingUpgrade.Len() > 0 { + machines = needingUpgrade + } + return controlPlane.MachineInFailureDomainWithMostMachines(machines) +} diff --git a/controlplane/kubeadm/controllers/scale_test.go b/controlplane/kubeadm/controllers/scale_test.go index 9aa5bd3962ea..088d3716798c 100644 --- a/controlplane/kubeadm/controllers/scale_test.go +++ b/controlplane/kubeadm/controllers/scale_test.go @@ -20,15 +20,19 @@ import ( "context" "fmt" "testing" + "time" . 
"github.com/onsi/gomega" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" + "k8s.io/client-go/kubernetes/scheme" "k8s.io/client-go/tools/record" clusterv1 "sigs.k8s.io/cluster-api/api/v1alpha3" bootstrapv1 "sigs.k8s.io/cluster-api/bootstrap/kubeadm/api/v1alpha3" controlplanev1 "sigs.k8s.io/cluster-api/controlplane/kubeadm/api/v1alpha3" "sigs.k8s.io/cluster-api/controlplane/kubeadm/internal" + "sigs.k8s.io/cluster-api/controlplane/kubeadm/internal/hash" capierrors "sigs.k8s.io/cluster-api/errors" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" @@ -108,7 +112,7 @@ func TestKubeadmControlPlaneReconciler_scaleUpControlPlane(t *testing.T) { Machines: fmc.Machines, } - result, err := r.scaleUpControlPlane(context.Background(), cluster, kcp, fmc.Machines.DeepCopy(), controlPlane) + result, err := r.scaleUpControlPlane(context.Background(), cluster, kcp, controlPlane) g.Expect(result).To(Equal(ctrl.Result{Requeue: true})) g.Expect(err).ToNot(HaveOccurred()) @@ -163,8 +167,7 @@ func TestKubeadmControlPlaneReconciler_scaleUpControlPlane(t *testing.T) { Machines: beforeMachines, } - ownedMachines := fmc.Machines.DeepCopy() - _, err := r.scaleUpControlPlane(context.Background(), cluster.DeepCopy(), kcp.DeepCopy(), ownedMachines, controlPlane) + _, err := r.scaleUpControlPlane(context.Background(), cluster.DeepCopy(), kcp.DeepCopy(), controlPlane) g.Expect(err).To(HaveOccurred()) g.Expect(err).To(MatchError(&capierrors.RequeueAfterError{RequeueAfter: healthCheckFailedRequeueAfter})) @@ -206,6 +209,105 @@ func TestKubeadmControlPlaneReconciler_scaleDownControlPlane_NoError(t *testing. 
Machines: machines, } - _, err := r.scaleDownControlPlane(context.Background(), cluster, kcp, machines, machines, controlPlane) + _, err := r.scaleDownControlPlane(context.Background(), cluster, kcp, controlPlane) g.Expect(err).ToNot(HaveOccurred()) } + +func TestSelectMachineForScaleDown(t *testing.T) { + kcp := controlplanev1.KubeadmControlPlane{ + Spec: controlplanev1.KubeadmControlPlaneSpec{}, + } + startDate := time.Date(2000, 1, 1, 1, 0, 0, 0, time.UTC) + m1 := machine("machine-1", withFailureDomain("one"), withTimestamp(startDate.Add(time.Hour)), withValidHash(kcp.Spec)) + m2 := machine("machine-2", withFailureDomain("one"), withTimestamp(startDate.Add(-3*time.Hour)), withValidHash(kcp.Spec)) + m3 := machine("machine-3", withFailureDomain("one"), withTimestamp(startDate.Add(-4*time.Hour)), withValidHash(kcp.Spec)) + m4 := machine("machine-4", withFailureDomain("two"), withTimestamp(startDate.Add(-time.Hour)), withValidHash(kcp.Spec)) + m5 := machine("machine-5", withFailureDomain("two"), withTimestamp(startDate.Add(-2*time.Hour)), withHash("shrug")) + + mc3 := internal.NewFilterableMachineCollection(m1, m2, m3, m4, m5) + fd := clusterv1.FailureDomains{ + "one": failureDomain(true), + "two": failureDomain(true), + } + + needsUpgradeControlPlane := &internal.ControlPlane{ + KCP: &kcp, + Cluster: &clusterv1.Cluster{Status: clusterv1.ClusterStatus{FailureDomains: fd}}, + Machines: mc3, + } + upToDateControlPlane := &internal.ControlPlane{ + KCP: &kcp, + Cluster: &clusterv1.Cluster{Status: clusterv1.ClusterStatus{FailureDomains: fd}}, + Machines: mc3.Filter(func(m *clusterv1.Machine) bool { + return m.Name != "machine-5" + }), + } + + testCases := []struct { + name string + cp *internal.ControlPlane + expectErr bool + expectedMachine clusterv1.Machine + }{ + { + name: "when there are machines needing upgrade, it returns the oldest machine in the failure domain with the most machines needing upgrade", + cp: needsUpgradeControlPlane, + expectErr: false, + 
expectedMachine: clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "machine-5"}}, + }, + { + name: "when there are no outdated machines, it returns the oldest machine in the largest failure domain", + cp: upToDateControlPlane, + expectErr: false, + expectedMachine: clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "machine-3"}}, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + g := NewWithT(t) + + g.Expect(clusterv1.AddToScheme(scheme.Scheme)).To(Succeed()) + + selectedMachine, err := selectMachineForScaleDown(tc.cp) + + if tc.expectErr { + g.Expect(err).To(HaveOccurred()) + return + } + + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(tc.expectedMachine.Name).To(Equal(selectedMachine.Name)) + }) + } +} + +func failureDomain(controlPlane bool) clusterv1.FailureDomainSpec { + return clusterv1.FailureDomainSpec{ + ControlPlane: controlPlane, + } +} + +func withFailureDomain(fd string) machineOpt { + return func(m *clusterv1.Machine) { + m.Spec.FailureDomain = &fd + } +} + +func withTimestamp(t time.Time) machineOpt { + return func(m *clusterv1.Machine) { + m.CreationTimestamp = metav1.NewTime(t) + } +} + +func withValidHash(kcp controlplanev1.KubeadmControlPlaneSpec) machineOpt { + return func(m *clusterv1.Machine) { + withHash(hash.Compute(&kcp))(m) + } +} + +func withHash(hash string) machineOpt { + return func(m *clusterv1.Machine) { + m.SetLabels(map[string]string{controlplanev1.KubeadmControlPlaneHashLabelKey: hash}) + } +} diff --git a/controlplane/kubeadm/controllers/upgrade.go b/controlplane/kubeadm/controllers/upgrade.go index 8037c9c7263c..cca3fd980304 100644 --- a/controlplane/kubeadm/controllers/upgrade.go +++ b/controlplane/kubeadm/controllers/upgrade.go @@ -24,7 +24,6 @@ import ( clusterv1 "sigs.k8s.io/cluster-api/api/v1alpha3" controlplanev1 "sigs.k8s.io/cluster-api/controlplane/kubeadm/api/v1alpha3" "sigs.k8s.io/cluster-api/controlplane/kubeadm/internal" - 
"sigs.k8s.io/cluster-api/controlplane/kubeadm/internal/machinefilters" "sigs.k8s.io/cluster-api/util" ctrl "sigs.k8s.io/controller-runtime" ) @@ -33,8 +32,6 @@ func (r *KubeadmControlPlaneReconciler) upgradeControlPlane( ctx context.Context, cluster *clusterv1.Cluster, kcp *controlplanev1.KubeadmControlPlane, - ownedMachines internal.FilterableMachineCollection, - requireUpgrade internal.FilterableMachineCollection, controlPlane *internal.ControlPlane, ) (ctrl.Result, error) { logger := controlPlane.Logger() @@ -82,44 +79,14 @@ func (r *KubeadmControlPlaneReconciler) upgradeControlPlane( return ctrl.Result{}, errors.Wrap(err, "failed to upgrade kubelet config map") } - // If there is not already a Machine that is marked for upgrade, find one and mark it - selectedForUpgrade := requireUpgrade.Filter(machinefilters.HasAnnotationKey(controlplanev1.SelectedForUpgradeAnnotation)) - if len(selectedForUpgrade) == 0 { - selectedMachine, err := r.selectAndMarkMachine(ctx, requireUpgrade, controlplanev1.SelectedForUpgradeAnnotation, controlPlane) - if err != nil { - logger.Error(err, "failed to select machine for upgrade") - return ctrl.Result{}, err - } - selectedForUpgrade = selectedForUpgrade.Insert(selectedMachine) - } - - replacementCreated := selectedForUpgrade.Filter(machinefilters.HasAnnotationKey(controlplanev1.UpgradeReplacementCreatedAnnotation)) - if len(replacementCreated) == 0 { - // TODO: should we also add a check here to ensure that current machines not > kcp.spec.replicas+1? 
- // We haven't created a replacement machine for the cluster yet - // return here to avoid blocking while waiting for the new control plane Machine to come up - result, err := r.scaleUpControlPlane(ctx, cluster, kcp, ownedMachines, controlPlane) - if err != nil { - return ctrl.Result{}, err - } - if err := r.markWithAnnotationKey(ctx, selectedForUpgrade.Oldest(), controlplanev1.UpgradeReplacementCreatedAnnotation); err != nil { - return ctrl.Result{}, err - } - return result, nil - } - - return r.scaleDownControlPlane(ctx, cluster, kcp, ownedMachines, replacementCreated, controlPlane) -} - -func (r *KubeadmControlPlaneReconciler) selectAndMarkMachine(ctx context.Context, machines internal.FilterableMachineCollection, annotation string, controlPlane *internal.ControlPlane) (*clusterv1.Machine, error) { - selected, err := controlPlane.MachineInFailureDomainWithMostMachines(machines) + status, err := workloadCluster.ClusterStatus(ctx) if err != nil { - return nil, errors.Wrap(err, "failed to select and mark a machine") - } - //annotation - if err := r.markWithAnnotationKey(ctx, selected, annotation); err != nil { - return nil, errors.Wrap(err, "failed to select and mark a machine") + return ctrl.Result{}, err } - return selected, nil + if status.Nodes <= *kcp.Spec.Replicas { + // scaleUp ensures that we don't continue scaling up while waiting for Machines to have NodeRefs + return r.scaleUpControlPlane(ctx, cluster, kcp, controlPlane) + } + return r.scaleDownControlPlane(ctx, cluster, kcp, controlPlane) } diff --git a/controlplane/kubeadm/controllers/upgrade_test.go b/controlplane/kubeadm/controllers/upgrade_test.go index b80ce306c6cb..5136054e5a28 100644 --- a/controlplane/kubeadm/controllers/upgrade_test.go +++ b/controlplane/kubeadm/controllers/upgrade_test.go @@ -19,15 +19,13 @@ package controllers import ( "context" "testing" - "time" . 
"github.com/onsi/gomega" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/client-go/kubernetes/scheme" "k8s.io/client-go/tools/record" + "k8s.io/utils/pointer" clusterv1 "sigs.k8s.io/cluster-api/api/v1alpha3" - controlplanev1 "sigs.k8s.io/cluster-api/controlplane/kubeadm/api/v1alpha3" "sigs.k8s.io/cluster-api/controlplane/kubeadm/internal" capierrors "sigs.k8s.io/cluster-api/errors" ctrl "sigs.k8s.io/controller-runtime" @@ -41,6 +39,7 @@ func TestKubeadmControlPlaneReconciler_upgradeControlPlane(t *testing.T) { cluster, kcp, genericMachineTemplate := createClusterWithControlPlane() kcp.Spec.Version = "v1.17.3" kcp.Spec.KubeadmConfigSpec.ClusterConfiguration = nil + kcp.Spec.Replicas = pointer.Int32Ptr(1) fakeClient := newFakeClient(g, cluster.DeepCopy(), kcp.DeepCopy(), genericMachineTemplate.DeepCopy()) @@ -49,8 +48,10 @@ func TestKubeadmControlPlaneReconciler_upgradeControlPlane(t *testing.T) { Log: log.Log, recorder: record.NewFakeRecorder(32), managementCluster: &fakeManagementCluster{ - Management: &internal.Management{Client: fakeClient}, - Workload: fakeWorkloadCluster{}, + Management: &internal.Management{Client: fakeClient}, + Workload: fakeWorkloadCluster{Status: internal.ClusterStatus{Nodes: 1}}, + ControlPlaneHealthy: true, + EtcdHealthy: true, }, } controlPlane := &internal.ControlPlane{ @@ -63,158 +64,52 @@ func TestKubeadmControlPlaneReconciler_upgradeControlPlane(t *testing.T) { g.Expect(result).To(Equal(ctrl.Result{Requeue: true})) g.Expect(err).NotTo(HaveOccurred()) - machineList := &clusterv1.MachineList{} - g.Expect(fakeClient.List(context.Background(), machineList, client.InNamespace(cluster.Namespace))).To(Succeed()) - g.Expect(machineList.Items).NotTo(BeEmpty()) - g.Expect(machineList.Items).To(HaveLen(1)) + // initial setup + initialMachine := &clusterv1.MachineList{} + g.Expect(fakeClient.List(context.Background(), initialMachine, client.InNamespace(cluster.Namespace))).To(Succeed()) + g.Expect(initialMachine.Items).To(HaveLen(1)) - 
machineCollection := internal.NewFilterableMachineCollection(&machineList.Items[0]) - result, err = r.upgradeControlPlane(context.Background(), cluster, kcp, machineCollection, machineCollection, controlPlane) + // change the KCP spec so the machine becomes outdated + kcp.Spec.Version = "v1.17.4" - g.Expect(machineList.Items[0].Annotations).To(HaveKey(controlplanev1.SelectedForUpgradeAnnotation)) - - // TODO flesh out the rest of this test - this is currently least-effort to confirm a fix for an NPE when updating - // the etcd version - g.Expect(result).To(Equal(ctrl.Result{})) + // run upgrade the first time, expect we scale up + needingUpgrade := internal.NewFilterableMachineCollectionFromMachineList(initialMachine) + controlPlane.Machines = needingUpgrade + result, err = r.upgradeControlPlane(context.Background(), cluster, kcp, controlPlane) + g.Expect(result).To(Equal(ctrl.Result{Requeue: true})) + g.Expect(err).To(BeNil()) + bothMachines := &clusterv1.MachineList{} + g.Expect(fakeClient.List(context.Background(), bothMachines, client.InNamespace(cluster.Namespace))).To(Succeed()) + g.Expect(bothMachines.Items).To(HaveLen(2)) + + // run upgrade a second time, simulate that the node has not appeared yet but the machine exists + r.managementCluster.(*fakeManagementCluster).ControlPlaneHealthy = false + _, err = r.upgradeControlPlane(context.Background(), cluster, kcp, controlPlane) g.Expect(err).To(Equal(&capierrors.RequeueAfterError{RequeueAfter: healthCheckFailedRequeueAfter})) -} - -func TestSelectMachineForUpgrade(t *testing.T) { - g := NewWithT(t) - - cluster, kcp, genericMachineTemplate := createClusterWithControlPlane() - kcp.Spec.KubeadmConfigSpec.ClusterConfiguration = nil - - m1 := machine("machine-1", withFailureDomain("one"), withTimestamp(metav1.Time{Time: time.Date(1, 0, 0, 0, 0, 0, 0, time.UTC)})) - m2 := machine("machine-2", withFailureDomain("two"), withTimestamp(metav1.Time{Time: time.Date(2, 0, 0, 0, 0, 0, 0, time.UTC)})) - m3 := 
machine("machine-3", withFailureDomain("two"), withTimestamp(metav1.Time{Time: time.Date(3, 0, 0, 0, 0, 0, 0, time.UTC)})) - m4 := machine("machine-4", withFailureDomain("four")) - - mc1 := internal.FilterableMachineCollection{"machine-1": m1} - mc2 := internal.FilterableMachineCollection{ - "machine-1": m1, - "machine-2": m2, - } - mc3 := internal.FilterableMachineCollection{ - "machine-1": m1, - "machine-2": m2, - "machine-3": m3, - } - fd := clusterv1.FailureDomains{ - "one": failureDomain(true), - "two": failureDomain(true), - "three": failureDomain(false), - } - - controlPlane3Machines := &internal.ControlPlane{ - KCP: &controlplanev1.KubeadmControlPlane{}, - Cluster: &clusterv1.Cluster{Status: clusterv1.ClusterStatus{FailureDomains: fd}}, - Machines: mc3, - } - - fakeClient := newFakeClient( - g, - cluster.DeepCopy(), - kcp.DeepCopy(), - genericMachineTemplate.DeepCopy(), - m1.DeepCopy(), - m2.DeepCopy(), - m4.DeepCopy(), - ) - - r := &KubeadmControlPlaneReconciler{ - Client: fakeClient, - Log: log.Log, - recorder: record.NewFakeRecorder(32), - managementCluster: &fakeManagementCluster{ - Management: &internal.Management{Client: fakeClient}, - Workload: fakeWorkloadCluster{}, - }, - } - - testCases := []struct { - name string - upgradeMachines internal.FilterableMachineCollection - cp *internal.ControlPlane - expectErr bool - expectedMachine clusterv1.Machine - }{ - { - name: "When controlplane and upgrade machines are same, picks the oldest failure domain with most machines ", - upgradeMachines: mc3, - cp: controlPlane3Machines, - expectErr: false, - expectedMachine: clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "machine-2"}}, - }, - { - name: "Picks the upgrade machine even if it does not belong to the fd with most machines", - upgradeMachines: mc1, - cp: controlPlane3Machines, - expectErr: false, - expectedMachine: clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "machine-1"}}, - }, - { - name: "Picks the upgrade machine that belongs to the fd 
with most machines", - upgradeMachines: mc2, - cp: controlPlane3Machines, - expectErr: false, - expectedMachine: clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "machine-2"}}, - }, - { - name: "Picks the upgrade machine that is not in existing failure domains", - upgradeMachines: internal.FilterableMachineCollection{"machine-4": m4}, - cp: controlPlane3Machines, - expectErr: false, - expectedMachine: clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "machine-4"}}, - }, - { - name: "Marking machine with annotation fails", - upgradeMachines: internal.FilterableMachineCollection{"machine-3": m3}, - cp: controlPlane3Machines, - expectErr: true, - }, - } - - for _, tc := range testCases { - t.Run(tc.name, func(t *testing.T) { - g := NewWithT(t) - - g.Expect(clusterv1.AddToScheme(scheme.Scheme)).To(Succeed()) + g.Expect(fakeClient.List(context.Background(), bothMachines, client.InNamespace(cluster.Namespace))).To(Succeed()) + g.Expect(bothMachines.Items).To(HaveLen(2)) - selectedMachine, err := r.selectAndMarkMachine(context.Background(), tc.upgradeMachines, controlplanev1.SelectedForUpgradeAnnotation, tc.cp) + controlPlane.Machines = internal.NewFilterableMachineCollectionFromMachineList(bothMachines) - if tc.expectErr { - g.Expect(err).To(HaveOccurred()) - return - } + // manually increase number of nodes, make control plane healthy again + r.managementCluster.(*fakeManagementCluster).Workload.Status.Nodes++ + r.managementCluster.(*fakeManagementCluster).ControlPlaneHealthy = true - g.Expect(err).NotTo(HaveOccurred()) - g.Expect(selectedMachine.Name).To(Equal(tc.expectedMachine.Name)) - }) - } - -} + // run upgrade the second time, expect we scale down + result, err = r.upgradeControlPlane(context.Background(), cluster, kcp, controlPlane) + g.Expect(err).To(BeNil()) + g.Expect(result).To(Equal(ctrl.Result{Requeue: true})) + finalMachine := &clusterv1.MachineList{} + g.Expect(fakeClient.List(context.Background(), finalMachine, 
client.InNamespace(cluster.Namespace))).To(Succeed()) + g.Expect(finalMachine.Items).To(HaveLen(1)) -func failureDomain(controlPlane bool) clusterv1.FailureDomainSpec { - return clusterv1.FailureDomainSpec{ - ControlPlane: controlPlane, - } + // assert that the deleted machine is the oldest, initial machine + g.Expect(finalMachine.Items[0].Name).ToNot(Equal(initialMachine.Items[0].Name)) + g.Expect(finalMachine.Items[0].CreationTimestamp.Time).To(BeTemporally(">", initialMachine.Items[0].CreationTimestamp.Time)) } type machineOpt func(*clusterv1.Machine) -func withFailureDomain(fd string) machineOpt { - return func(m *clusterv1.Machine) { - m.Spec.FailureDomain = &fd - } -} - -func withTimestamp(t metav1.Time) machineOpt { - return func(m *clusterv1.Machine) { - m.CreationTimestamp = t - } -} - func machine(name string, opts ...machineOpt) *clusterv1.Machine { m := &clusterv1.Machine{ ObjectMeta: metav1.ObjectMeta{ diff --git a/controlplane/kubeadm/internal/control_plane.go b/controlplane/kubeadm/internal/control_plane.go index e14da9ed117b..b87bb3ec478a 100644 --- a/controlplane/kubeadm/internal/control_plane.go +++ b/controlplane/kubeadm/internal/control_plane.go @@ -48,7 +48,7 @@ func NewControlPlane(cluster *clusterv1.Cluster, kcp *controlplanev1.KubeadmCont // Logger returns a logger with useful context. func (c *ControlPlane) Logger() logr.Logger { - return Log.WithValues("namespace", c.KCP.Namespace, "name", c.KCP.Name, "cluster-nanme", c.Cluster.Name) + return Log.WithValues("namespace", c.KCP.Namespace, "name", c.KCP.Name, "cluster-name", c.Cluster.Name) } // FailureDomains returns a slice of failure domain objects synced from the infrastructure provider into Cluster.Status.