Skip to content

Commit

Permalink
Merge pull request #2833 from sedefsavas/selectmostlogic
Browse files Browse the repository at this point in the history
🐛[KCP] Fix for selecting failure domain logic during upgrade
  • Loading branch information
k8s-ci-robot authored Apr 6, 2020
2 parents 0c7d6e0 + 47a9925 commit e34f9ac
Show file tree
Hide file tree
Showing 6 changed files with 119 additions and 52 deletions.
14 changes: 5 additions & 9 deletions controlplane/kubeadm/controllers/scale.go
Original file line number Diff line number Diff line change
Expand Up @@ -97,18 +97,14 @@ func (r *KubeadmControlPlaneReconciler) scaleDownControlPlane(
return ctrl.Result{}, &capierrors.RequeueAfterError{RequeueAfter: deleteRequeueAfter}
}

// If there is not already a Machine that is marked for scale down, find one and mark it
markedForDeletion := selectedMachines.Filter(machinefilters.HasAnnotationKey(controlplanev1.DeleteForScaleDownAnnotation))
if len(markedForDeletion) == 0 {
fd := controlPlane.FailureDomainWithMostMachines(selectedMachines)
machinesInFailureDomain := selectedMachines.Filter(machinefilters.InFailureDomains(fd))
machineToMark := machinesInFailureDomain.Oldest()
if machineToMark == nil {
return ctrl.Result{}, errors.New("failed to pick control plane Machine to mark for deletion")
machineToMark, err := r.selectAndMarkMachine(ctx, selectedMachines, controlplanev1.DeleteForScaleDownAnnotation, controlPlane)
if err != nil {
return ctrl.Result{}, errors.Wrap(err, "failed to select and mark machine for scale down")
}
if err := r.markWithAnnotationKey(ctx, machineToMark, controlplanev1.DeleteForScaleDownAnnotation); err != nil {
return ctrl.Result{}, errors.Wrapf(err, "failed to mark machine %s for deletion", machineToMark.Name)
}
markedForDeletion.Insert(machinesInFailureDomain.Oldest())
markedForDeletion.Insert(machineToMark)
}

machineToDelete := markedForDeletion.Oldest()
Expand Down
18 changes: 9 additions & 9 deletions controlplane/kubeadm/controllers/upgrade.go
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ func (r *KubeadmControlPlaneReconciler) upgradeControlPlane(
// If there is not already a Machine that is marked for upgrade, find one and mark it
selectedForUpgrade := requireUpgrade.Filter(machinefilters.HasAnnotationKey(controlplanev1.SelectedForUpgradeAnnotation))
if len(selectedForUpgrade) == 0 {
selectedMachine, err := r.selectMachineForUpgrade(ctx, cluster, requireUpgrade, controlPlane)
selectedMachine, err := r.selectAndMarkMachine(ctx, requireUpgrade, controlplanev1.SelectedForUpgradeAnnotation, controlPlane)
if err != nil {
logger.Error(err, "failed to select machine for upgrade")
return ctrl.Result{}, err
Expand All @@ -111,14 +111,14 @@ func (r *KubeadmControlPlaneReconciler) upgradeControlPlane(
return r.scaleDownControlPlane(ctx, cluster, kcp, ownedMachines, replacementCreated, controlPlane)
}

func (r *KubeadmControlPlaneReconciler) selectMachineForUpgrade(ctx context.Context, _ *clusterv1.Cluster, requireUpgrade internal.FilterableMachineCollection, controlPlane *internal.ControlPlane) (*clusterv1.Machine, error) {
failureDomain := controlPlane.FailureDomainWithMostMachines(requireUpgrade)

inFailureDomain := requireUpgrade.Filter(machinefilters.InFailureDomains(failureDomain))
selected := inFailureDomain.Oldest()

if err := r.markWithAnnotationKey(ctx, selected, controlplanev1.SelectedForUpgradeAnnotation); err != nil {
return nil, errors.Wrap(err, "failed to select and mark a machine for upgrade")
func (r *KubeadmControlPlaneReconciler) selectAndMarkMachine(ctx context.Context, machines internal.FilterableMachineCollection, annotation string, controlPlane *internal.ControlPlane) (*clusterv1.Machine, error) {
selected, err := controlPlane.MachineInFailureDomainWithMostMachines(machines)
if err != nil {
return nil, errors.Wrap(err, "failed to select and mark a machine")
}
//annotation
if err := r.markWithAnnotationKey(ctx, selected, annotation); err != nil {
return nil, errors.Wrap(err, "failed to select and mark a machine")
}

return selected, nil
Expand Down
62 changes: 48 additions & 14 deletions controlplane/kubeadm/controllers/upgrade_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -85,34 +85,41 @@ func TestSelectMachineForUpgrade(t *testing.T) {
cluster, kcp, genericMachineTemplate := createClusterWithControlPlane()
kcp.Spec.KubeadmConfigSpec.ClusterConfiguration = nil

m1 := machine("machine-1", withFailureDomain("one"))
m2 := machine("machine-2", withFailureDomain("two"), withTimestamp(metav1.Time{Time: time.Date(1, 0, 0, 0, 0, 0, 0, time.UTC)}))
m3 := machine("machine-3", withFailureDomain("two"), withTimestamp(metav1.Time{Time: time.Date(2, 0, 0, 0, 0, 0, 0, time.UTC)}))
m1 := machine("machine-1", withFailureDomain("one"), withTimestamp(metav1.Time{Time: time.Date(1, 0, 0, 0, 0, 0, 0, time.UTC)}))
m2 := machine("machine-2", withFailureDomain("two"), withTimestamp(metav1.Time{Time: time.Date(2, 0, 0, 0, 0, 0, 0, time.UTC)}))
m3 := machine("machine-3", withFailureDomain("two"), withTimestamp(metav1.Time{Time: time.Date(3, 0, 0, 0, 0, 0, 0, time.UTC)}))
m4 := machine("machine-4", withFailureDomain("four"))

mc1 := internal.FilterableMachineCollection{
mc1 := internal.FilterableMachineCollection{"machine-1": m1}
mc2 := internal.FilterableMachineCollection{
"machine-1": m1,
"machine-2": m2,
}
mc3 := internal.FilterableMachineCollection{
"machine-1": m1,
"machine-2": m2,
"machine-3": m3,
}
fd1 := clusterv1.FailureDomains{
fd := clusterv1.FailureDomains{
"one": failureDomain(true),
"two": failureDomain(true),
"three": failureDomain(true),
"four": failureDomain(false),
"three": failureDomain(false),
}

controlPlane := &internal.ControlPlane{
controlPlane3Machines := &internal.ControlPlane{
KCP: &controlplanev1.KubeadmControlPlane{},
Cluster: &clusterv1.Cluster{Status: clusterv1.ClusterStatus{FailureDomains: fd1}},
Machines: mc1,
Cluster: &clusterv1.Cluster{Status: clusterv1.ClusterStatus{FailureDomains: fd}},
Machines: mc3,
}

fakeClient := newFakeClient(
g,
cluster.DeepCopy(),
kcp.DeepCopy(),
genericMachineTemplate.DeepCopy(),
m1.DeepCopy(),
m2.DeepCopy(),
m4.DeepCopy(),
)

r := &KubeadmControlPlaneReconciler{
Expand All @@ -128,17 +135,44 @@ func TestSelectMachineForUpgrade(t *testing.T) {
testCases := []struct {
name string
upgradeMachines internal.FilterableMachineCollection
cpMachines internal.FilterableMachineCollection
cp *internal.ControlPlane
expectErr bool
expectedMachine clusterv1.Machine
}{
{
name: "matching controlplane machines and upgrade machines",
name: "When controlplane and upgrade machines are same, picks the oldest failure domain with most machines ",
upgradeMachines: mc3,
cp: controlPlane3Machines,
expectErr: false,
expectedMachine: clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "machine-2"}},
},
{
name: "Picks the upgrade machine even if it does not belong to the fd with most machines",
upgradeMachines: mc1,
cpMachines: controlPlane.Machines,
cp: controlPlane3Machines,
expectErr: false,
expectedMachine: clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "machine-1"}},
},
{
name: "Picks the upgrade machine that belongs to the fd with most machines",
upgradeMachines: mc2,
cp: controlPlane3Machines,
expectErr: false,
expectedMachine: clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "machine-2"}},
},
{
name: "Picks the upgrade machine that is not in existing failure domains",
upgradeMachines: internal.FilterableMachineCollection{"machine-4": m4},
cp: controlPlane3Machines,
expectErr: false,
expectedMachine: clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "machine-4"}},
},
{
name: "Marking machine with annotation fails",
upgradeMachines: internal.FilterableMachineCollection{"machine-3": m3},
cp: controlPlane3Machines,
expectErr: true,
},
}

for _, tc := range testCases {
Expand All @@ -147,7 +181,7 @@ func TestSelectMachineForUpgrade(t *testing.T) {

g.Expect(clusterv1.AddToScheme(scheme.Scheme)).To(Succeed())

selectedMachine, err := r.selectMachineForUpgrade(context.Background(), cluster, controlPlane.Machines, controlPlane)
selectedMachine, err := r.selectAndMarkMachine(context.Background(), tc.upgradeMachines, controlplanev1.SelectedForUpgradeAnnotation, tc.cp)

if tc.expectErr {
g.Expect(err).To(HaveOccurred())
Expand Down
19 changes: 15 additions & 4 deletions controlplane/kubeadm/internal/control_plane.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ package internal

import (
"github.com/go-logr/logr"
"github.com/pkg/errors"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apiserver/pkg/storage/names"
Expand Down Expand Up @@ -107,8 +108,19 @@ func (c *ControlPlane) MachinesNeedingUpgrade() FilterableMachineCollection {
)
}

// FailureDomainWithMostMachines returns the failure domain with the most number of machines.
// Used when scaling down.
// MachineInFailureDomainWithMostMachines returns the first matching failure domain with machines that has the most control-plane machines on it.
func (c *ControlPlane) MachineInFailureDomainWithMostMachines(machines FilterableMachineCollection) (*clusterv1.Machine, error) {
fd := c.FailureDomainWithMostMachines(machines)
machinesInFailureDomain := machines.Filter(machinefilters.InFailureDomains(fd))
machineToMark := machinesInFailureDomain.Oldest()
if machineToMark == nil {
return nil, errors.New("failed to pick control plane Machine to mark for deletion")
}
return machineToMark, nil
}

// FailureDomainWithMostMachines returns a fd which exists both in machines and control-plane machines and has the most
// control-plane machines on it.
func (c *ControlPlane) FailureDomainWithMostMachines(machines FilterableMachineCollection) *string {
// See if there are any Machines that are not in currently defined failure domains first.
notInFailureDomains := machines.Filter(
Expand All @@ -121,8 +133,7 @@ func (c *ControlPlane) FailureDomainWithMostMachines(machines FilterableMachineC
return notInFailureDomains.Oldest().Spec.FailureDomain
}

// Otherwise pick the currently known failure domain with the most Machines
return PickMost(c.Cluster.Status.FailureDomains.FilterControlPlane(), machines)
return PickMost(c, machines)
}

// FailureDomainWithFewestMachines returns the failure domain with the fewest number of machines.
Expand Down
24 changes: 20 additions & 4 deletions controlplane/kubeadm/internal/failure_domain.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,15 +50,31 @@ func (f failureDomainAggregations) Swap(i, j int) {
f[i], f[j] = f[j], f[i]
}

// PickMost returns the failure domain with the most number of machines.
func PickMost(failureDomains clusterv1.FailureDomains, machines FilterableMachineCollection) *string {
// PickMost returns a failure domain that is in machines and has most control-plane machines on.
func PickMost(c *ControlPlane, machines FilterableMachineCollection) *string {
// orderDescending sorts failure domains according to all control plane machines
fds := orderDescending(c.Cluster.Status.FailureDomains.FilterControlPlane(), c.Machines)
for _, fd := range fds {
for _, m := range machines {
if m.Spec.FailureDomain == nil {
continue
}
if *m.Spec.FailureDomain == fd.id {
return &fd.id
}
}
}
return nil
}

// orderDescending returns the sorted failure domains in decreasing order.
func orderDescending(failureDomains clusterv1.FailureDomains, machines FilterableMachineCollection) failureDomainAggregations {
aggregations := pick(failureDomains, machines)
if len(aggregations) == 0 {
return nil
}
sort.Sort(sort.Reverse(aggregations))
return pointer.StringPtr(aggregations[0].id)

return aggregations
}

// PickFewest returns the failure domain with the fewest number of machines.
Expand Down
34 changes: 22 additions & 12 deletions controlplane/kubeadm/internal/failure_domain_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -102,8 +102,8 @@ func TestNewFailureDomainPickMost(t *testing.T) {
b := pointer.StringPtr("us-west-1b")

fds := clusterv1.FailureDomains{
*a: clusterv1.FailureDomainSpec{},
*b: clusterv1.FailureDomainSpec{},
*a: clusterv1.FailureDomainSpec{ControlPlane: true},
*b: clusterv1.FailureDomainSpec{ControlPlane: true},
}
machinea := &clusterv1.Machine{Spec: clusterv1.MachineSpec{FailureDomain: a}}
machineb := &clusterv1.Machine{Spec: clusterv1.MachineSpec{FailureDomain: b}}
Expand All @@ -120,11 +120,11 @@ func TestNewFailureDomainPickMost(t *testing.T) {
expected: nil,
},
{
name: "no machines",
name: "no machines should return nil",
fds: clusterv1.FailureDomains{
*a: clusterv1.FailureDomainSpec{},
},
expected: []*string{a},
expected: nil,
},
{
name: "one machine in a failure domain",
Expand All @@ -135,30 +135,40 @@ func TestNewFailureDomainPickMost(t *testing.T) {
{
name: "no failure domain specified on machine",
fds: clusterv1.FailureDomains{
*a: clusterv1.FailureDomainSpec{},
*a: clusterv1.FailureDomainSpec{ControlPlane: true},
},
machines: NewFilterableMachineCollection(machinenil.DeepCopy()),
expected: []*string{a},
expected: nil,
},
{
name: "mismatched failure domain on machine",
name: "mismatched failure domain on machine should return nil",
fds: clusterv1.FailureDomains{
*a: clusterv1.FailureDomainSpec{},
*a: clusterv1.FailureDomainSpec{ControlPlane: true},
},
machines: NewFilterableMachineCollection(machineb.DeepCopy()),
expected: []*string{a},
expected: nil,
},
{
name: "failure domains and no machines should return a valid failure domain",
name: "failure domains and no machines should return nil",
fds: fds,
expected: []*string{a, b},
expected: nil,
},
{
name: "nil failure domains with machines",
machines: NewFilterableMachineCollection(machineb.DeepCopy()),
expected: nil,
},
{
name: "nil failure domains with no machines",
expected: nil,
},
}
for _, tc := range testcases {
t.Run(tc.name, func(t *testing.T) {
g := NewWithT(t)

fd := PickMost(tc.fds, tc.machines)
fd := PickMost(&ControlPlane{Machines: tc.machines,
Cluster: &clusterv1.Cluster{Status: clusterv1.ClusterStatus{FailureDomains: tc.fds}}}, tc.machines)
if tc.expected == nil {
g.Expect(fd).To(BeNil())
} else {
Expand Down

0 comments on commit e34f9ac

Please sign in to comment.