From 27244044ddce9d01f51866b3bbcae61f6c335fa5 Mon Sep 17 00:00:00 2001 From: fabriziopandini Date: Wed, 12 May 2021 12:34:35 +0200 Subject: [PATCH] KCP remediation for 2CP --- .../kubeadm/controllers/remediation.go | 8 +- .../kubeadm/controllers/remediation_test.go | 113 ++++++++++++++++-- .../20191017-kubeadm-based-control-plane.md | 2 +- 3 files changed, 111 insertions(+), 12 deletions(-) diff --git a/controlplane/kubeadm/controllers/remediation.go b/controlplane/kubeadm/controllers/remediation.go index 854f656fba58..682b5ccabcb8 100644 --- a/controlplane/kubeadm/controllers/remediation.go +++ b/controlplane/kubeadm/controllers/remediation.go @@ -79,10 +79,10 @@ func (r *KubeadmControlPlaneReconciler) reconcileUnhealthyMachines(ctx context.C desiredReplicas := int(*controlPlane.KCP.Spec.Replicas) - // The cluster MUST have spec.replicas >= 3, because this is the smallest cluster size that allows any etcd failure tolerance. - if desiredReplicas < 3 { - logger.Info("A control plane machine needs remediation, but the number of desired replicas is less than 3. Skipping remediation", "UnhealthyMachine", machineToBeRemediated.Name, "Replicas", desiredReplicas) - conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate if there are less than 3 desired replicas") + // The cluster MUST have more than one replica, because this is the smallest cluster size that allows any etcd failure tolerance. + if controlPlane.Machines.Len() <= 1 { + logger.Info("A control plane machine needs remediation, but the number of current replicas is less or equal to 1. Skipping remediation", "UnhealthyMachine", machineToBeRemediated.Name, "Replicas", controlPlane.Machines.Len()) + conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate if current replicas are less or equal then 1") return ctrl.Result{}, nil } diff --git a/controlplane/kubeadm/controllers/remediation_test.go b/controlplane/kubeadm/controllers/remediation_test.go index ba4647f356ec..972d56d4d911 100644 --- a/controlplane/kubeadm/controllers/remediation_test.go +++ b/controlplane/kubeadm/controllers/remediation_test.go @@ -77,7 +77,7 @@ func TestReconcileUnhealthyMachines(t *testing.T) { g.Expect(ret.IsZero()).To(BeTrue()) // Remediation skipped g.Expect(err).ToNot(HaveOccurred()) }) - t.Run("Remediation does not happen if desired replicas < 3", func(t *testing.T) { + t.Run("Remediation does not happen if desired replicas <= 1", func(t *testing.T) { g := NewWithT(t) m := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed()) @@ -92,28 +92,29 @@ func TestReconcileUnhealthyMachines(t *testing.T) { g.Expect(ret.IsZero()).To(BeTrue()) // Remediation skipped g.Expect(err).ToNot(HaveOccurred()) - assertMachineCondition(ctx, g, m, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate if there are less than 3 desired replicas") + assertMachineCondition(ctx, g, m, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate if current replicas are less or equal then 1") g.Expect(testEnv.Cleanup(ctx, m)).To(Succeed()) }) t.Run("Remediation does not happen if number of machines lower than desired", func(t *testing.T) { g := NewWithT(t) - m := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed()) + m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed()) + m2 := createMachine(ctx, g, ns.Name, "m2-healthy-") controlPlane := &internal.ControlPlane{ KCP: &controlplanev1.KubeadmControlPlane{Spec: controlplanev1.KubeadmControlPlaneSpec{ Replicas: utilpointer.Int32Ptr(3), }}, Cluster: &clusterv1.Cluster{}, - Machines: internal.NewFilterableMachineCollection(m), + Machines: internal.NewFilterableMachineCollection(m1, m2), } ret, err := r.reconcileUnhealthyMachines(context.TODO(), controlPlane) g.Expect(ret.IsZero()).To(BeTrue()) // Remediation skipped g.Expect(err).ToNot(HaveOccurred()) - assertMachineCondition(ctx, g, m, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP waiting for having at least 3 control plane machines before triggering remediation") + assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP waiting for having at least 3 control plane machines before triggering remediation") - g.Expect(testEnv.Cleanup(ctx, m)).To(Succeed()) + g.Expect(testEnv.Cleanup(ctx, m1, m2)).To(Succeed()) }) t.Run("Remediation does not happen if there is a deleting machine", func(t *testing.T) { g := NewWithT(t) @@ -206,7 +207,55 @@ func TestReconcileUnhealthyMachines(t *testing.T) { g.Expect(testEnv.Cleanup(ctx, m1, m2, m3, m4, m5)).To(Succeed()) }) - t.Run("Remediation deletes unhealthy machine", func(t *testing.T) { + t.Run("Remediation deletes unhealthy machine - 2 CP (during 1 CP rolling upgrade)", func(t *testing.T) { + g := NewWithT(t) + + m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed()) + patchHelper, err := patch.NewHelper(m1, testEnv.GetClient()) + g.Expect(err).ToNot(HaveOccurred()) + m1.ObjectMeta.Finalizers = []string{"wait-before-delete"} + g.Expect(patchHelper.Patch(ctx, m1)) + + m2 := createMachine(ctx, g, ns.Name, "m2-healthy-", withHealthyEtcdMember()) + + controlPlane := &internal.ControlPlane{ + KCP: &controlplanev1.KubeadmControlPlane{Spec: controlplanev1.KubeadmControlPlaneSpec{ + Replicas: utilpointer.Int32Ptr(2), + }}, + Cluster: &clusterv1.Cluster{}, + Machines: internal.NewFilterableMachineCollection(m1, m2), + } + + r := &KubeadmControlPlaneReconciler{ + Client: testEnv.GetClient(), + Log: log.Log, + recorder: record.NewFakeRecorder(32), + managementCluster: &fakeManagementCluster{ + Workload: fakeWorkloadCluster{ + EtcdMembersResult: nodes(controlPlane.Machines), + }, + }, + } + + ret, err := r.reconcileUnhealthyMachines(context.TODO(), controlPlane) + + g.Expect(ret.IsZero()).To(BeFalse()) // Remediation completed, requeue + g.Expect(err).ToNot(HaveOccurred()) + + assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "") + + err = testEnv.Get(ctx, client.ObjectKey{Namespace: m1.Namespace, Name: m1.Name}, m1) + g.Expect(err).ToNot(HaveOccurred()) + g.Expect(m1.ObjectMeta.DeletionTimestamp.IsZero()).To(BeFalse()) + + patchHelper, err = patch.NewHelper(m1, testEnv.GetClient()) + g.Expect(err).ToNot(HaveOccurred()) + m1.ObjectMeta.Finalizers = nil + g.Expect(patchHelper.Patch(ctx, m1)) + + g.Expect(testEnv.Cleanup(ctx, m1, m2)).To(Succeed()) + }) + t.Run("Remediation deletes unhealthy machine - 3 CP", func(t *testing.T) { g := NewWithT(t) m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed()) @@ -255,6 +304,56 @@ func TestReconcileUnhealthyMachines(t *testing.T) { g.Expect(testEnv.Cleanup(ctx, m1, m2, m3)).To(Succeed()) }) + t.Run("Remediation deletes unhealthy machine - 4 CP (during 3 CP rolling upgrade)", func(t *testing.T) { + g := NewWithT(t) + + m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed()) + patchHelper, err := patch.NewHelper(m1, testEnv.GetClient()) + g.Expect(err).ToNot(HaveOccurred()) + m1.ObjectMeta.Finalizers = []string{"wait-before-delete"} + g.Expect(patchHelper.Patch(ctx, m1)) + + m2 := createMachine(ctx, g, ns.Name, "m2-healthy-", withHealthyEtcdMember()) + m3 := createMachine(ctx, g, ns.Name, "m3-healthy-", withHealthyEtcdMember()) + m4 := createMachine(ctx, g, ns.Name, "m4-healthy-", withHealthyEtcdMember()) + + controlPlane := &internal.ControlPlane{ + KCP: &controlplanev1.KubeadmControlPlane{Spec: controlplanev1.KubeadmControlPlaneSpec{ + Replicas: utilpointer.Int32Ptr(4), + }}, + Cluster: &clusterv1.Cluster{}, + Machines: internal.NewFilterableMachineCollection(m1, m2, m3, m4), + } + + r := &KubeadmControlPlaneReconciler{ + Client: testEnv.GetClient(), + Log: log.Log, + recorder: record.NewFakeRecorder(32), + managementCluster: &fakeManagementCluster{ + Workload: fakeWorkloadCluster{ + EtcdMembersResult: nodes(controlPlane.Machines), + }, + }, + } + + ret, err := r.reconcileUnhealthyMachines(context.TODO(), controlPlane) + + g.Expect(ret.IsZero()).To(BeFalse()) // Remediation completed, requeue + g.Expect(err).ToNot(HaveOccurred()) + + assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "") + + err = testEnv.Get(ctx, client.ObjectKey{Namespace: m1.Namespace, Name: m1.Name}, m1) + g.Expect(err).ToNot(HaveOccurred()) + g.Expect(m1.ObjectMeta.DeletionTimestamp.IsZero()).To(BeFalse()) + + patchHelper, err = patch.NewHelper(m1, testEnv.GetClient()) + g.Expect(err).ToNot(HaveOccurred()) + m1.ObjectMeta.Finalizers = nil + g.Expect(patchHelper.Patch(ctx, m1)) + + g.Expect(testEnv.Cleanup(ctx, m1, m2, m3, m4)).To(Succeed()) + }) g.Expect(testEnv.Cleanup(ctx, ns)).To(Succeed()) } diff --git a/docs/proposals/20191017-kubeadm-based-control-plane.md b/docs/proposals/20191017-kubeadm-based-control-plane.md index 94c1be76d832..6a67e605db7c 100644 --- a/docs/proposals/20191017-kubeadm-based-control-plane.md +++ b/docs/proposals/20191017-kubeadm-based-control-plane.md @@ -394,7 +394,7 @@ spec: for additional details. When there are multiple machines that are marked for remediation, the oldest one will be remediated first. - Following rules should be satisfied in order to start remediation - - The cluster MUST have spec.replicas >= 3, because this is the smallest cluster size that allows any etcd failure tolerance. + - The cluster MUST have at least two control plane machines, because this is the smallest cluster size that can be remediated. - The number of replicas MUST be equal to or greater than the desired replicas. This rule ensures that when the cluster is missing replicas, we skip remediation and instead perform regular scale up/rollout operations first. - The cluster MUST have no machines with a deletion timestamp. This rule prevents KCP taking actions while the cluster is in a transitional state.