Skip to content

Commit

Permalink
Merge pull request #4591 from fabriziopandini/KCP-remediation-fix
Browse files Browse the repository at this point in the history
🐛 Allow KCP remediation when the etcd member being remediated is missing
  • Loading branch information
k8s-ci-robot authored May 11, 2021
2 parents 3bf624d + df1c28e commit 22c20bd
Show file tree
Hide file tree
Showing 2 changed files with 128 additions and 23 deletions.
30 changes: 13 additions & 17 deletions controlplane/kubeadm/controllers/remediation.go
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,7 @@ func (r *KubeadmControlPlaneReconciler) reconcileUnhealthyMachines(ctx context.C
// without loosing etcd quorum.
//
// The answer mostly depend on the existence of other failing members on top of the one being deleted, and according
// to the etcd fault tolerance specification (see https://github.com/etcd-io/etcd/blob/master/Documentation/faq.md#what-is-failure-tolerance):
// to the etcd fault tolerance specification (see https://etcd.io/docs/v3.3/faq/#what-is-failure-tolerance):
// - 3 CP cluster does not tolerate additional failing members on top of the one being deleted (the target
// cluster size after deletion is 2, fault tolerance 0)
// - 5 CP cluster tolerates 1 additional failing members on top of the one being deleted (the target
Expand Down Expand Up @@ -190,17 +190,8 @@ func (r *KubeadmControlPlaneReconciler) canSafelyRemoveEtcdMember(ctx context.Co
"currentTotalMembers", currentTotalMembers,
"currentMembers", etcdMembers)

// The cluster MUST have at least 3 members, because this is the smallest cluster size that allows any etcd failure tolerance.
//
// NOTE: This should not happen given that we are checking the number of replicas before calling this method, however
// given that this could be destructive, this is an additional safeguard.
if currentTotalMembers < 3 {
logger.Info("etcd cluster with less of 3 members can't be safely remediated")
return false, nil
}

targetTotalMembers := currentTotalMembers - 1
targetQuorum := targetTotalMembers/2.0 + 1
// Projects the target etcd cluster after remediation, considering all the etcd members except the one being remediated.
targetTotalMembers := 0
targetUnhealthyMembers := 0

healthyMembers := []string{}
Expand All @@ -211,6 +202,9 @@ func (r *KubeadmControlPlaneReconciler) canSafelyRemoveEtcdMember(ctx context.Co
continue
}

// Include the member in the target etcd cluster.
targetTotalMembers++

// Search for the machine corresponding to the etcd member.
var machine *clusterv1.Machine
for _, m := range controlPlane.Machines {
Expand Down Expand Up @@ -241,15 +235,17 @@ func (r *KubeadmControlPlaneReconciler) canSafelyRemoveEtcdMember(ctx context.Co
healthyMembers = append(healthyMembers, fmt.Sprintf("%s (%s)", etcdMember, machine.Name))
}

// See https://etcd.io/docs/v3.3/faq/#what-is-failure-tolerance for fault tolerance formula explanation.
targetQuorum := (targetTotalMembers / 2.0) + 1
canSafelyRemediate := targetTotalMembers-targetUnhealthyMembers >= targetQuorum

logger.Info(fmt.Sprintf("etcd cluster projected after remediation of %s", machineToBeRemediated.Name),
"healthyMembers", healthyMembers,
"unhealthyMembers", unhealthyMembers,
"targetTotalMembers", targetTotalMembers,
"targetQuorum", targetQuorum,
"targetUnhealthyMembers", targetUnhealthyMembers,
"projectedQuorum", targetTotalMembers-targetUnhealthyMembers)
if targetTotalMembers-targetUnhealthyMembers >= targetQuorum {
return true, nil
}
return false, nil
"canSafelyRemediate", canSafelyRemediate)

return canSafelyRemediate, nil
}
121 changes: 115 additions & 6 deletions controlplane/kubeadm/controllers/remediation_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ package controllers
import (
"context"
"fmt"
"strings"
"testing"

. "github.com/onsi/gomega"
Expand Down Expand Up @@ -295,15 +296,84 @@ func TestCanSafelyRemoveEtcdMember(t *testing.T) {

g.Expect(testEnv.Cleanup(ctx, m1)).To(Succeed())
})
t.Run("Can't safely remediate 2 machine CP", func(t *testing.T) {
t.Run("Can safely remediate 2 machine CP without additional etcd member failures", func(t *testing.T) {
g := NewWithT(t)

m1 := createMachine(ctx, g, ns.Name, "m1-mhc-unhealthy-", withMachineHealthCheckFailed())
m2 := createMachine(ctx, g, ns.Name, "m2-etcd-healthy-", withHealthyEtcdMember())

controlPlane := &internal.ControlPlane{
KCP: &controlplanev1.KubeadmControlPlane{Spec: controlplanev1.KubeadmControlPlaneSpec{
Replicas: utilpointer.Int32Ptr(2),
Replicas: utilpointer.Int32Ptr(3),
}},
Cluster: &clusterv1.Cluster{},
Machines: internal.NewFilterableMachineCollection(m1, m2),
}

r := &KubeadmControlPlaneReconciler{
Client: testEnv.GetClient(),
Log: log.Log,
recorder: record.NewFakeRecorder(32),
managementCluster: &fakeManagementCluster{
Workload: fakeWorkloadCluster{
EtcdMembersResult: nodes(controlPlane.Machines),
},
},
}

ret, err := r.canSafelyRemoveEtcdMember(context.TODO(), controlPlane, m1)
g.Expect(ret).To(BeTrue())
g.Expect(err).ToNot(HaveOccurred())

g.Expect(testEnv.Cleanup(ctx, m1, m2)).To(Succeed())
})
t.Run("Can safely remediate 2 machines CP when the etcd member being remediated is missing", func(t *testing.T) {
g := NewWithT(t)

m1 := createMachine(ctx, g, ns.Name, "m1-mhc-unhealthy-", withMachineHealthCheckFailed())
m2 := createMachine(ctx, g, ns.Name, "m2-etcd-healthy-", withHealthyEtcdMember())

controlPlane := &internal.ControlPlane{
KCP: &controlplanev1.KubeadmControlPlane{Spec: controlplanev1.KubeadmControlPlaneSpec{
Replicas: utilpointer.Int32Ptr(3),
}},
Cluster: &clusterv1.Cluster{},
Machines: internal.NewFilterableMachineCollection(m1, m2),
}

members := make([]string, 0, len(controlPlane.Machines)-1)
for _, n := range nodes(controlPlane.Machines) {
if !strings.Contains(n, "m1-mhc-unhealthy-") {
members = append(members, n)
}
}

r := &KubeadmControlPlaneReconciler{
Client: testEnv.GetClient(),
Log: log.Log,
recorder: record.NewFakeRecorder(32),
managementCluster: &fakeManagementCluster{
Workload: fakeWorkloadCluster{
EtcdMembersResult: members,
},
},
}

ret, err := r.canSafelyRemoveEtcdMember(context.TODO(), controlPlane, m1)
g.Expect(ret).To(BeTrue())
g.Expect(err).ToNot(HaveOccurred())

g.Expect(testEnv.Cleanup(ctx, m1, m2)).To(Succeed())
})
t.Run("Can't safely remediate 2 machines CP with one additional etcd member failure", func(t *testing.T) {
g := NewWithT(t)

m1 := createMachine(ctx, g, ns.Name, "m1-mhc-unhealthy-", withMachineHealthCheckFailed())
m2 := createMachine(ctx, g, ns.Name, "m2-etcd-unhealthy-", withUnhealthyEtcdMember())

controlPlane := &internal.ControlPlane{
KCP: &controlplanev1.KubeadmControlPlane{Spec: controlplanev1.KubeadmControlPlaneSpec{
Replicas: utilpointer.Int32Ptr(3),
}},
Cluster: &clusterv1.Cluster{},
Machines: internal.NewFilterableMachineCollection(m1, m2),
Expand Down Expand Up @@ -335,7 +405,7 @@ func TestCanSafelyRemoveEtcdMember(t *testing.T) {

controlPlane := &internal.ControlPlane{
KCP: &controlplanev1.KubeadmControlPlane{Spec: controlplanev1.KubeadmControlPlaneSpec{
Replicas: utilpointer.Int32Ptr(5),
Replicas: utilpointer.Int32Ptr(3),
}},
Cluster: &clusterv1.Cluster{},
Machines: internal.NewFilterableMachineCollection(m1, m2, m3),
Expand All @@ -358,6 +428,45 @@ func TestCanSafelyRemoveEtcdMember(t *testing.T) {

g.Expect(testEnv.Cleanup(ctx, m1, m2, m3)).To(Succeed())
})
t.Run("Can safely remediate 3 machines CP when the etcd member being remediated is missing", func(t *testing.T) {
g := NewWithT(t)

m1 := createMachine(ctx, g, ns.Name, "m1-mhc-unhealthy-", withMachineHealthCheckFailed())
m2 := createMachine(ctx, g, ns.Name, "m2-etcd-healthy-", withHealthyEtcdMember())
m3 := createMachine(ctx, g, ns.Name, "m3-etcd-healthy-", withHealthyEtcdMember())

controlPlane := &internal.ControlPlane{
KCP: &controlplanev1.KubeadmControlPlane{Spec: controlplanev1.KubeadmControlPlaneSpec{
Replicas: utilpointer.Int32Ptr(3),
}},
Cluster: &clusterv1.Cluster{},
Machines: internal.NewFilterableMachineCollection(m1, m2, m3),
}

members := make([]string, 0, len(controlPlane.Machines)-1)
for _, n := range nodes(controlPlane.Machines) {
if !strings.Contains(n, "m1-mhc-unhealthy-") {
members = append(members, n)
}
}

r := &KubeadmControlPlaneReconciler{
Client: testEnv.GetClient(),
Log: log.Log,
recorder: record.NewFakeRecorder(32),
managementCluster: &fakeManagementCluster{
Workload: fakeWorkloadCluster{
EtcdMembersResult: members,
},
},
}

ret, err := r.canSafelyRemoveEtcdMember(context.TODO(), controlPlane, m1)
g.Expect(ret).To(BeTrue())
g.Expect(err).ToNot(HaveOccurred())

g.Expect(testEnv.Cleanup(ctx, m1, m2, m3)).To(Succeed())
})
t.Run("Can't safely remediate 3 machines CP with one additional etcd member failure", func(t *testing.T) {
g := NewWithT(t)

Expand All @@ -367,7 +476,7 @@ func TestCanSafelyRemoveEtcdMember(t *testing.T) {

controlPlane := &internal.ControlPlane{
KCP: &controlplanev1.KubeadmControlPlane{Spec: controlplanev1.KubeadmControlPlaneSpec{
Replicas: utilpointer.Int32Ptr(5),
Replicas: utilpointer.Int32Ptr(3),
}},
Cluster: &clusterv1.Cluster{},
Machines: internal.NewFilterableMachineCollection(m1, m2, m3),
Expand Down Expand Up @@ -435,7 +544,7 @@ func TestCanSafelyRemoveEtcdMember(t *testing.T) {

controlPlane := &internal.ControlPlane{
KCP: &controlplanev1.KubeadmControlPlane{Spec: controlplanev1.KubeadmControlPlaneSpec{
Replicas: utilpointer.Int32Ptr(5),
Replicas: utilpointer.Int32Ptr(7),
}},
Cluster: &clusterv1.Cluster{},
Machines: internal.NewFilterableMachineCollection(m1, m2, m3, m4, m5),
Expand Down Expand Up @@ -471,7 +580,7 @@ func TestCanSafelyRemoveEtcdMember(t *testing.T) {

controlPlane := &internal.ControlPlane{
KCP: &controlplanev1.KubeadmControlPlane{Spec: controlplanev1.KubeadmControlPlaneSpec{
Replicas: utilpointer.Int32Ptr(5),
Replicas: utilpointer.Int32Ptr(7),
}},
Cluster: &clusterv1.Cluster{},
Machines: internal.NewFilterableMachineCollection(m1, m2, m3, m4, m5, m6, m7),
Expand Down

0 comments on commit 22c20bd

Please sign in to comment.