From 47a9925aa88e168e76b436fc341c42a31f0ebd61 Mon Sep 17 00:00:00 2001 From: Sedef Date: Tue, 31 Mar 2020 10:11:34 -0700 Subject: [PATCH] [KCP] Fix for selecting failure domain logic for ugrade --- controlplane/kubeadm/controllers/scale.go | 14 ++--- controlplane/kubeadm/controllers/upgrade.go | 18 +++--- .../kubeadm/controllers/upgrade_test.go | 62 ++++++++++++++----- .../kubeadm/internal/control_plane.go | 19 ++++-- .../kubeadm/internal/failure_domain.go | 24 +++++-- .../kubeadm/internal/failure_domain_test.go | 34 ++++++---- 6 files changed, 119 insertions(+), 52 deletions(-) diff --git a/controlplane/kubeadm/controllers/scale.go b/controlplane/kubeadm/controllers/scale.go index 0c42607b8b44..d0ff0e081b02 100644 --- a/controlplane/kubeadm/controllers/scale.go +++ b/controlplane/kubeadm/controllers/scale.go @@ -97,18 +97,14 @@ func (r *KubeadmControlPlaneReconciler) scaleDownControlPlane( return ctrl.Result{}, &capierrors.RequeueAfterError{RequeueAfter: deleteRequeueAfter} } + // If there is not already a Machine that is marked for scale down, find one and mark it markedForDeletion := selectedMachines.Filter(machinefilters.HasAnnotationKey(controlplanev1.DeleteForScaleDownAnnotation)) if len(markedForDeletion) == 0 { - fd := controlPlane.FailureDomainWithMostMachines(selectedMachines) - machinesInFailureDomain := selectedMachines.Filter(machinefilters.InFailureDomains(fd)) - machineToMark := machinesInFailureDomain.Oldest() - if machineToMark == nil { - return ctrl.Result{}, errors.New("failed to pick control plane Machine to mark for deletion") + machineToMark, err := r.selectAndMarkMachine(ctx, selectedMachines, controlplanev1.DeleteForScaleDownAnnotation, controlPlane) + if err != nil { + return ctrl.Result{}, errors.Wrap(err, "failed to select and mark machine for scale down") } - if err := r.markWithAnnotationKey(ctx, machineToMark, controlplanev1.DeleteForScaleDownAnnotation); err != nil { - return ctrl.Result{}, errors.Wrapf(err, "failed to mark machine %s for deletion", machineToMark.Name) - } - markedForDeletion.Insert(machinesInFailureDomain.Oldest()) + markedForDeletion.Insert(machineToMark) } machineToDelete := markedForDeletion.Oldest() diff --git a/controlplane/kubeadm/controllers/upgrade.go b/controlplane/kubeadm/controllers/upgrade.go index a1d25411ac6f..8037c9c7263c 100644 --- a/controlplane/kubeadm/controllers/upgrade.go +++ b/controlplane/kubeadm/controllers/upgrade.go @@ -85,7 +85,7 @@ func (r *KubeadmControlPlaneReconciler) upgradeControlPlane( // If there is not already a Machine that is marked for upgrade, find one and mark it selectedForUpgrade := requireUpgrade.Filter(machinefilters.HasAnnotationKey(controlplanev1.SelectedForUpgradeAnnotation)) if len(selectedForUpgrade) == 0 { - selectedMachine, err := r.selectMachineForUpgrade(ctx, cluster, requireUpgrade, controlPlane) + selectedMachine, err := r.selectAndMarkMachine(ctx, requireUpgrade, controlplanev1.SelectedForUpgradeAnnotation, controlPlane) if err != nil { logger.Error(err, "failed to select machine for upgrade") return ctrl.Result{}, err @@ -111,14 +111,14 @@ func (r *KubeadmControlPlaneReconciler) upgradeControlPlane( return r.scaleDownControlPlane(ctx, cluster, kcp, ownedMachines, replacementCreated, controlPlane) } -func (r *KubeadmControlPlaneReconciler) selectMachineForUpgrade(ctx context.Context, _ *clusterv1.Cluster, requireUpgrade internal.FilterableMachineCollection, controlPlane *internal.ControlPlane) (*clusterv1.Machine, error) { - failureDomain := controlPlane.FailureDomainWithMostMachines(requireUpgrade) - - inFailureDomain := requireUpgrade.Filter(machinefilters.InFailureDomains(failureDomain)) - selected := inFailureDomain.Oldest() - - if err := r.markWithAnnotationKey(ctx, selected, controlplanev1.SelectedForUpgradeAnnotation); err != nil { - return nil, errors.Wrap(err, "failed to select and mark a machine for upgrade") +func (r *KubeadmControlPlaneReconciler) selectAndMarkMachine(ctx context.Context, machines internal.FilterableMachineCollection, annotation string, controlPlane *internal.ControlPlane) (*clusterv1.Machine, error) { + selected, err := controlPlane.MachineInFailureDomainWithMostMachines(machines) + if err != nil { + return nil, errors.Wrap(err, "failed to select and mark a machine") + } + //annotation + if err := r.markWithAnnotationKey(ctx, selected, annotation); err != nil { + return nil, errors.Wrap(err, "failed to select and mark a machine") } return selected, nil diff --git a/controlplane/kubeadm/controllers/upgrade_test.go b/controlplane/kubeadm/controllers/upgrade_test.go index 478756eef37c..b80ce306c6cb 100644 --- a/controlplane/kubeadm/controllers/upgrade_test.go +++ b/controlplane/kubeadm/controllers/upgrade_test.go @@ -85,26 +85,31 @@ func TestSelectMachineForUpgrade(t *testing.T) { cluster, kcp, genericMachineTemplate := createClusterWithControlPlane() kcp.Spec.KubeadmConfigSpec.ClusterConfiguration = nil - m1 := machine("machine-1", withFailureDomain("one")) - m2 := machine("machine-2", withFailureDomain("two"), withTimestamp(metav1.Time{Time: time.Date(1, 0, 0, 0, 0, 0, 0, time.UTC)})) - m3 := machine("machine-3", withFailureDomain("two"), withTimestamp(metav1.Time{Time: time.Date(2, 0, 0, 0, 0, 0, 0, time.UTC)})) + m1 := machine("machine-1", withFailureDomain("one"), withTimestamp(metav1.Time{Time: time.Date(1, 0, 0, 0, 0, 0, 0, time.UTC)})) + m2 := machine("machine-2", withFailureDomain("two"), withTimestamp(metav1.Time{Time: time.Date(2, 0, 0, 0, 0, 0, 0, time.UTC)})) + m3 := machine("machine-3", withFailureDomain("two"), withTimestamp(metav1.Time{Time: time.Date(3, 0, 0, 0, 0, 0, 0, time.UTC)})) + m4 := machine("machine-4", withFailureDomain("four")) - mc1 := internal.FilterableMachineCollection{ + mc1 := internal.FilterableMachineCollection{"machine-1": m1} + mc2 := internal.FilterableMachineCollection{ + "machine-1": m1, + "machine-2": m2, + } + mc3 := internal.FilterableMachineCollection{ "machine-1": m1, "machine-2": m2, "machine-3": m3, } - fd1 := clusterv1.FailureDomains{ + fd := clusterv1.FailureDomains{ "one": failureDomain(true), "two": failureDomain(true), - "three": failureDomain(true), - "four": failureDomain(false), + "three": failureDomain(false), } - controlPlane := &internal.ControlPlane{ + controlPlane3Machines := &internal.ControlPlane{ KCP: &controlplanev1.KubeadmControlPlane{}, - Cluster: &clusterv1.Cluster{Status: clusterv1.ClusterStatus{FailureDomains: fd1}}, - Machines: mc1, + Cluster: &clusterv1.Cluster{Status: clusterv1.ClusterStatus{FailureDomains: fd}}, + Machines: mc3, } fakeClient := newFakeClient( @@ -112,7 +117,9 @@ func TestSelectMachineForUpgrade(t *testing.T) { cluster.DeepCopy(), kcp.DeepCopy(), genericMachineTemplate.DeepCopy(), + m1.DeepCopy(), m2.DeepCopy(), + m4.DeepCopy(), ) r := &KubeadmControlPlaneReconciler{ @@ -128,17 +135,44 @@ func TestSelectMachineForUpgrade(t *testing.T) { testCases := []struct { name string upgradeMachines internal.FilterableMachineCollection - cpMachines internal.FilterableMachineCollection + cp *internal.ControlPlane expectErr bool expectedMachine clusterv1.Machine }{ { - name: "matching controlplane machines and upgrade machines", + name: "When controlplane and upgrade machines are same, picks the oldest failure domain with most machines ", + upgradeMachines: mc3, + cp: controlPlane3Machines, + expectErr: false, + expectedMachine: clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "machine-2"}}, + }, + { + name: "Picks the upgrade machine even if it does not belong to the fd with most machines", upgradeMachines: mc1, - cpMachines: controlPlane.Machines, + cp: controlPlane3Machines, + expectErr: false, + expectedMachine: clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "machine-1"}}, + }, + { + name: "Picks the upgrade machine that belongs to the fd with most machines", + upgradeMachines: mc2, + cp: controlPlane3Machines, expectErr: false, expectedMachine: clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "machine-2"}}, }, + { + name: "Picks the upgrade machine that is not in existing failure domains", + upgradeMachines: internal.FilterableMachineCollection{"machine-4": m4}, + cp: controlPlane3Machines, + expectErr: false, + expectedMachine: clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "machine-4"}}, + }, + { + name: "Marking machine with annotation fails", + upgradeMachines: internal.FilterableMachineCollection{"machine-3": m3}, + cp: controlPlane3Machines, + expectErr: true, + }, } for _, tc := range testCases { @@ -147,7 +181,7 @@ func TestSelectMachineForUpgrade(t *testing.T) { g.Expect(clusterv1.AddToScheme(scheme.Scheme)).To(Succeed()) - selectedMachine, err := r.selectMachineForUpgrade(context.Background(), cluster, controlPlane.Machines, controlPlane) + selectedMachine, err := r.selectAndMarkMachine(context.Background(), tc.upgradeMachines, controlplanev1.SelectedForUpgradeAnnotation, tc.cp) if tc.expectErr { g.Expect(err).To(HaveOccurred()) diff --git a/controlplane/kubeadm/internal/control_plane.go b/controlplane/kubeadm/internal/control_plane.go index 08ca28bdcd24..e14da9ed117b 100644 --- a/controlplane/kubeadm/internal/control_plane.go +++ b/controlplane/kubeadm/internal/control_plane.go @@ -18,6 +18,7 @@ package internal import ( "github.com/go-logr/logr" + "github.com/pkg/errors" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apiserver/pkg/storage/names" @@ -107,8 +108,19 @@ func (c *ControlPlane) MachinesNeedingUpgrade() FilterableMachineCollection { ) } -// FailureDomainWithMostMachines returns the failure domain with the most number of machines. -// Used when scaling down. +// MachineInFailureDomainWithMostMachines returns the first matching failure domain with machines that has the most control-plane machines on it. +func (c *ControlPlane) MachineInFailureDomainWithMostMachines(machines FilterableMachineCollection) (*clusterv1.Machine, error) { + fd := c.FailureDomainWithMostMachines(machines) + machinesInFailureDomain := machines.Filter(machinefilters.InFailureDomains(fd)) + machineToMark := machinesInFailureDomain.Oldest() + if machineToMark == nil { + return nil, errors.New("failed to pick control plane Machine to mark for deletion") + } + return machineToMark, nil +} + +// FailureDomainWithMostMachines returns a fd which exists both in machines and control-plane machines and has the most +// control-plane machines on it. func (c *ControlPlane) FailureDomainWithMostMachines(machines FilterableMachineCollection) *string { // See if there are any Machines that are not in currently defined failure domains first. notInFailureDomains := machines.Filter( @@ -121,8 +133,7 @@ func (c *ControlPlane) FailureDomainWithMostMachines(machines FilterableMachineC return notInFailureDomains.Oldest().Spec.FailureDomain } - // Otherwise pick the currently known failure domain with the most Machines - return PickMost(c.Cluster.Status.FailureDomains.FilterControlPlane(), machines) + return PickMost(c, machines) } // FailureDomainWithFewestMachines returns the failure domain with the fewest number of machines. diff --git a/controlplane/kubeadm/internal/failure_domain.go b/controlplane/kubeadm/internal/failure_domain.go index 99c7c69aab8c..efc55efafd5e 100644 --- a/controlplane/kubeadm/internal/failure_domain.go +++ b/controlplane/kubeadm/internal/failure_domain.go @@ -50,15 +50,31 @@ func (f failureDomainAggregations) Swap(i, j int) { f[i], f[j] = f[j], f[i] } -// PickMost returns the failure domain with the most number of machines. -func PickMost(failureDomains clusterv1.FailureDomains, machines FilterableMachineCollection) *string { +// PickMost returns a failure domain that is in machines and has most control-plane machines on. +func PickMost(c *ControlPlane, machines FilterableMachineCollection) *string { + // orderDescending sorts failure domains according to all control plane machines + fds := orderDescending(c.Cluster.Status.FailureDomains.FilterControlPlane(), c.Machines) + for _, fd := range fds { + for _, m := range machines { + if m.Spec.FailureDomain == nil { + continue + } + if *m.Spec.FailureDomain == fd.id { + return &fd.id + } + } + } + return nil +} + +// orderDescending returns the sorted failure domains in decreasing order. +func orderDescending(failureDomains clusterv1.FailureDomains, machines FilterableMachineCollection) failureDomainAggregations { aggregations := pick(failureDomains, machines) if len(aggregations) == 0 { return nil } sort.Sort(sort.Reverse(aggregations)) - return pointer.StringPtr(aggregations[0].id) - + return aggregations } // PickFewest returns the failure domain with the fewest number of machines. diff --git a/controlplane/kubeadm/internal/failure_domain_test.go b/controlplane/kubeadm/internal/failure_domain_test.go index 83e5f52d7877..497b3a011adf 100644 --- a/controlplane/kubeadm/internal/failure_domain_test.go +++ b/controlplane/kubeadm/internal/failure_domain_test.go @@ -102,8 +102,8 @@ func TestNewFailureDomainPickMost(t *testing.T) { b := pointer.StringPtr("us-west-1b") fds := clusterv1.FailureDomains{ - *a: clusterv1.FailureDomainSpec{}, - *b: clusterv1.FailureDomainSpec{}, + *a: clusterv1.FailureDomainSpec{ControlPlane: true}, + *b: clusterv1.FailureDomainSpec{ControlPlane: true}, } machinea := &clusterv1.Machine{Spec: clusterv1.MachineSpec{FailureDomain: a}} machineb := &clusterv1.Machine{Spec: clusterv1.MachineSpec{FailureDomain: b}} @@ -120,11 +120,11 @@ func TestNewFailureDomainPickMost(t *testing.T) { expected: nil, }, { - name: "no machines", + name: "no machines should return nil", fds: clusterv1.FailureDomains{ *a: clusterv1.FailureDomainSpec{}, }, - expected: []*string{a}, + expected: nil, }, { name: "one machine in a failure domain", @@ -135,30 +135,40 @@ func TestNewFailureDomainPickMost(t *testing.T) { { name: "no failure domain specified on machine", fds: clusterv1.FailureDomains{ - *a: clusterv1.FailureDomainSpec{}, + *a: clusterv1.FailureDomainSpec{ControlPlane: true}, }, machines: NewFilterableMachineCollection(machinenil.DeepCopy()), - expected: []*string{a}, + expected: nil, }, { - name: "mismatched failure domain on machine", + name: "mismatched failure domain on machine should return nil", fds: clusterv1.FailureDomains{ - *a: clusterv1.FailureDomainSpec{}, + *a: clusterv1.FailureDomainSpec{ControlPlane: true}, }, machines: NewFilterableMachineCollection(machineb.DeepCopy()), - expected: []*string{a}, + expected: nil, }, { - name: "failure domains and no machines should return a valid failure domain", + name: "failure domains and no machines should return nil", fds: fds, - expected: []*string{a, b}, + expected: nil, + }, + { + name: "nil failure domains with machines", + machines: NewFilterableMachineCollection(machineb.DeepCopy()), + expected: nil, + }, + { + name: "nil failure domains with no machines", + expected: nil, }, } for _, tc := range testcases { t.Run(tc.name, func(t *testing.T) { g := NewWithT(t) - fd := PickMost(tc.fds, tc.machines) + fd := PickMost(&ControlPlane{Machines: tc.machines, + Cluster: &clusterv1.Cluster{Status: clusterv1.ClusterStatus{FailureDomains: tc.fds}}}, tc.machines) if tc.expected == nil { g.Expect(fd).To(BeNil()) } else {