From df1c28ed95b5e3abf98d01674192fca86531adcf Mon Sep 17 00:00:00 2001 From: fabriziopandini Date: Mon, 10 May 2021 19:53:03 +0200 Subject: [PATCH] Allow KCP remediation when the etcd member being remediated is missing --- .../kubeadm/controllers/remediation.go | 30 ++--- .../kubeadm/controllers/remediation_test.go | 121 +++++++++++++++++- 2 files changed, 128 insertions(+), 23 deletions(-) diff --git a/controlplane/kubeadm/controllers/remediation.go b/controlplane/kubeadm/controllers/remediation.go index 14c2a1bc142e..854f656fba58 100644 --- a/controlplane/kubeadm/controllers/remediation.go +++ b/controlplane/kubeadm/controllers/remediation.go @@ -154,7 +154,7 @@ func (r *KubeadmControlPlaneReconciler) reconcileUnhealthyMachines(ctx context.C // without loosing etcd quorum. // // The answer mostly depend on the existence of other failing members on top of the one being deleted, and according -// to the etcd fault tolerance specification (see https://github.com/etcd-io/etcd/blob/master/Documentation/faq.md#what-is-failure-tolerance): +// to the etcd fault tolerance specification (see https://etcd.io/docs/v3.3/faq/#what-is-failure-tolerance): // - 3 CP cluster does not tolerate additional failing members on top of the one being deleted (the target // cluster size after deletion is 2, fault tolerance 0) // - 5 CP cluster tolerates 1 additional failing members on top of the one being deleted (the target @@ -190,17 +190,8 @@ func (r *KubeadmControlPlaneReconciler) canSafelyRemoveEtcdMember(ctx context.Co "currentTotalMembers", currentTotalMembers, "currentMembers", etcdMembers) - // The cluster MUST have at least 3 members, because this is the smallest cluster size that allows any etcd failure tolerance. - // - // NOTE: This should not happen given that we are checking the number of replicas before calling this method, however - // given that this could be destructive, this is an additional safeguard. - if currentTotalMembers < 3 { - logger.Info("etcd cluster with less of 3 members can't be safely remediated") - return false, nil - } - - targetTotalMembers := currentTotalMembers - 1 - targetQuorum := targetTotalMembers/2.0 + 1 + // Projects the target etcd cluster after remediation, considering all the etcd members except the one being remediated. + targetTotalMembers := 0 targetUnhealthyMembers := 0 healthyMembers := []string{} @@ -211,6 +202,9 @@ func (r *KubeadmControlPlaneReconciler) canSafelyRemoveEtcdMember(ctx context.Co continue } + // Include the member in the target etcd cluster. + targetTotalMembers++ + // Search for the machine corresponding to the etcd member. var machine *clusterv1.Machine for _, m := range controlPlane.Machines { @@ -241,15 +235,17 @@ func (r *KubeadmControlPlaneReconciler) canSafelyRemoveEtcdMember(ctx context.Co healthyMembers = append(healthyMembers, fmt.Sprintf("%s (%s)", etcdMember, machine.Name)) } + // See https://etcd.io/docs/v3.3/faq/#what-is-failure-tolerance for fault tolerance formula explanation. + targetQuorum := (targetTotalMembers / 2.0) + 1 + canSafelyRemediate := targetTotalMembers-targetUnhealthyMembers >= targetQuorum + logger.Info(fmt.Sprintf("etcd cluster projected after remediation of %s", machineToBeRemediated.Name), "healthyMembers", healthyMembers, "unhealthyMembers", unhealthyMembers, "targetTotalMembers", targetTotalMembers, "targetQuorum", targetQuorum, "targetUnhealthyMembers", targetUnhealthyMembers, - "projectedQuorum", targetTotalMembers-targetUnhealthyMembers) - if targetTotalMembers-targetUnhealthyMembers >= targetQuorum { - return true, nil - } - return false, nil + "canSafelyRemediate", canSafelyRemediate) + + return canSafelyRemediate, nil } diff --git a/controlplane/kubeadm/controllers/remediation_test.go b/controlplane/kubeadm/controllers/remediation_test.go index 462261964b12..ba4647f356ec 100644 --- a/controlplane/kubeadm/controllers/remediation_test.go +++ b/controlplane/kubeadm/controllers/remediation_test.go @@ -19,6 +19,7 @@ package controllers import ( "context" "fmt" + "strings" "testing" . "github.com/onsi/gomega" @@ -295,7 +296,7 @@ func TestCanSafelyRemoveEtcdMember(t *testing.T) { g.Expect(testEnv.Cleanup(ctx, m1)).To(Succeed()) }) - t.Run("Can't safely remediate 2 machine CP", func(t *testing.T) { + t.Run("Can safely remediate 2 machine CP without additional etcd member failures", func(t *testing.T) { g := NewWithT(t) m1 := createMachine(ctx, g, ns.Name, "m1-mhc-unhealthy-", withMachineHealthCheckFailed()) @@ -303,7 +304,76 @@ func TestCanSafelyRemoveEtcdMember(t *testing.T) { controlPlane := &internal.ControlPlane{ KCP: &controlplanev1.KubeadmControlPlane{Spec: controlplanev1.KubeadmControlPlaneSpec{ - Replicas: utilpointer.Int32Ptr(2), + Replicas: utilpointer.Int32Ptr(3), + }}, + Cluster: &clusterv1.Cluster{}, + Machines: internal.NewFilterableMachineCollection(m1, m2), + } + + r := &KubeadmControlPlaneReconciler{ + Client: testEnv.GetClient(), + Log: log.Log, + recorder: record.NewFakeRecorder(32), + managementCluster: &fakeManagementCluster{ + Workload: fakeWorkloadCluster{ + EtcdMembersResult: nodes(controlPlane.Machines), + }, + }, + } + + ret, err := r.canSafelyRemoveEtcdMember(context.TODO(), controlPlane, m1) + g.Expect(ret).To(BeTrue()) + g.Expect(err).ToNot(HaveOccurred()) + + g.Expect(testEnv.Cleanup(ctx, m1, m2)).To(Succeed()) + }) + t.Run("Can safely remediate 2 machines CP when the etcd member being remediated is missing", func(t *testing.T) { + g := NewWithT(t) + + m1 := createMachine(ctx, g, ns.Name, "m1-mhc-unhealthy-", withMachineHealthCheckFailed()) + m2 := createMachine(ctx, g, ns.Name, "m2-etcd-healthy-", withHealthyEtcdMember()) + + controlPlane := &internal.ControlPlane{ + KCP: &controlplanev1.KubeadmControlPlane{Spec: controlplanev1.KubeadmControlPlaneSpec{ + Replicas: utilpointer.Int32Ptr(3), + }}, + Cluster: &clusterv1.Cluster{}, + Machines: internal.NewFilterableMachineCollection(m1, m2), + } + + members := make([]string, 0, len(controlPlane.Machines)-1) + for _, n := range nodes(controlPlane.Machines) { + if !strings.Contains(n, "m1-mhc-unhealthy-") { + members = append(members, n) + } + } + + r := &KubeadmControlPlaneReconciler{ + Client: testEnv.GetClient(), + Log: log.Log, + recorder: record.NewFakeRecorder(32), + managementCluster: &fakeManagementCluster{ + Workload: fakeWorkloadCluster{ + EtcdMembersResult: members, + }, + }, + } + + ret, err := r.canSafelyRemoveEtcdMember(context.TODO(), controlPlane, m1) + g.Expect(ret).To(BeTrue()) + g.Expect(err).ToNot(HaveOccurred()) + + g.Expect(testEnv.Cleanup(ctx, m1, m2)).To(Succeed()) + }) + t.Run("Can't safely remediate 2 machines CP with one additional etcd member failure", func(t *testing.T) { + g := NewWithT(t) + + m1 := createMachine(ctx, g, ns.Name, "m1-mhc-unhealthy-", withMachineHealthCheckFailed()) + m2 := createMachine(ctx, g, ns.Name, "m2-etcd-unhealthy-", withUnhealthyEtcdMember()) + + controlPlane := &internal.ControlPlane{ + KCP: &controlplanev1.KubeadmControlPlane{Spec: controlplanev1.KubeadmControlPlaneSpec{ + Replicas: utilpointer.Int32Ptr(3), }}, Cluster: &clusterv1.Cluster{}, Machines: internal.NewFilterableMachineCollection(m1, m2), @@ -335,7 +405,7 @@ func TestCanSafelyRemoveEtcdMember(t *testing.T) { controlPlane := &internal.ControlPlane{ KCP: &controlplanev1.KubeadmControlPlane{Spec: controlplanev1.KubeadmControlPlaneSpec{ - Replicas: utilpointer.Int32Ptr(5), + Replicas: utilpointer.Int32Ptr(3), }}, Cluster: &clusterv1.Cluster{}, Machines: internal.NewFilterableMachineCollection(m1, m2, m3), @@ -358,6 +428,45 @@ func TestCanSafelyRemoveEtcdMember(t *testing.T) { g.Expect(testEnv.Cleanup(ctx, m1, m2, m3)).To(Succeed()) }) + t.Run("Can safely remediate 3 machines CP when the etcd member being remediated is missing", func(t *testing.T) { + g := NewWithT(t) + + m1 := createMachine(ctx, g, ns.Name, "m1-mhc-unhealthy-", withMachineHealthCheckFailed()) + m2 := createMachine(ctx, g, ns.Name, "m2-etcd-healthy-", withHealthyEtcdMember()) + m3 := createMachine(ctx, g, ns.Name, "m3-etcd-healthy-", withHealthyEtcdMember()) + + controlPlane := &internal.ControlPlane{ + KCP: &controlplanev1.KubeadmControlPlane{Spec: controlplanev1.KubeadmControlPlaneSpec{ + Replicas: utilpointer.Int32Ptr(3), + }}, + Cluster: &clusterv1.Cluster{}, + Machines: internal.NewFilterableMachineCollection(m1, m2, m3), + } + + members := make([]string, 0, len(controlPlane.Machines)-1) + for _, n := range nodes(controlPlane.Machines) { + if !strings.Contains(n, "m1-mhc-unhealthy-") { + members = append(members, n) + } + } + + r := &KubeadmControlPlaneReconciler{ + Client: testEnv.GetClient(), + Log: log.Log, + recorder: record.NewFakeRecorder(32), + managementCluster: &fakeManagementCluster{ + Workload: fakeWorkloadCluster{ + EtcdMembersResult: members, + }, + }, + } + + ret, err := r.canSafelyRemoveEtcdMember(context.TODO(), controlPlane, m1) + g.Expect(ret).To(BeTrue()) + g.Expect(err).ToNot(HaveOccurred()) + + g.Expect(testEnv.Cleanup(ctx, m1, m2, m3)).To(Succeed()) + }) t.Run("Can't safely remediate 3 machines CP with one additional etcd member failure", func(t *testing.T) { g := NewWithT(t) @@ -367,7 +476,7 @@ func TestCanSafelyRemoveEtcdMember(t *testing.T) { controlPlane := &internal.ControlPlane{ KCP: &controlplanev1.KubeadmControlPlane{Spec: controlplanev1.KubeadmControlPlaneSpec{ - Replicas: utilpointer.Int32Ptr(5), + Replicas: utilpointer.Int32Ptr(3), }}, Cluster: &clusterv1.Cluster{}, Machines: internal.NewFilterableMachineCollection(m1, m2, m3), @@ -435,7 +544,7 @@ func TestCanSafelyRemoveEtcdMember(t *testing.T) { controlPlane := &internal.ControlPlane{ KCP: &controlplanev1.KubeadmControlPlane{Spec: controlplanev1.KubeadmControlPlaneSpec{ - Replicas: utilpointer.Int32Ptr(5), + Replicas: utilpointer.Int32Ptr(7), }}, Cluster: &clusterv1.Cluster{}, Machines: internal.NewFilterableMachineCollection(m1, m2, m3, m4, m5), @@ -471,7 +580,7 @@ func TestCanSafelyRemoveEtcdMember(t *testing.T) { controlPlane := &internal.ControlPlane{ KCP: &controlplanev1.KubeadmControlPlane{Spec: controlplanev1.KubeadmControlPlaneSpec{ - Replicas: utilpointer.Int32Ptr(5), + Replicas: utilpointer.Int32Ptr(7), }}, Cluster: &clusterv1.Cluster{}, Machines: internal.NewFilterableMachineCollection(m1, m2, m3, m4, m5, m6, m7),