diff --git a/controlplane/kubeadm/internal/controllers/fakes_test.go b/controlplane/kubeadm/internal/controllers/fakes_test.go index e9de1d951413..e93f9bb0a43d 100644 --- a/controlplane/kubeadm/internal/controllers/fakes_test.go +++ b/controlplane/kubeadm/internal/controllers/fakes_test.go @@ -20,6 +20,7 @@ import ( "context" "github.com/blang/semver" + "github.com/pkg/errors" "sigs.k8s.io/controller-runtime/pkg/client" clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" @@ -69,7 +70,10 @@ type fakeWorkloadCluster struct { EtcdMembersResult []string } -func (f fakeWorkloadCluster) ForwardEtcdLeadership(_ context.Context, _ *clusterv1.Machine, _ *clusterv1.Machine) error { +func (f fakeWorkloadCluster) ForwardEtcdLeadership(_ context.Context, _ *clusterv1.Machine, leaderCandidate *clusterv1.Machine) error { + if leaderCandidate == nil { + return errors.New("leaderCandidate is nil") + } return nil } diff --git a/controlplane/kubeadm/internal/controllers/remediation.go b/controlplane/kubeadm/internal/controllers/remediation.go index 384a5d6f6dd6..dd799e958cd7 100644 --- a/controlplane/kubeadm/internal/controllers/remediation.go +++ b/controlplane/kubeadm/internal/controllers/remediation.go @@ -23,6 +23,7 @@ import ( "github.com/blang/semver" "github.com/pkg/errors" kerrors "k8s.io/apimachinery/pkg/util/errors" + "k8s.io/klog/v2" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" @@ -106,27 +107,27 @@ func (r *KubeadmControlPlaneReconciler) reconcileUnhealthyMachines(ctx context.C // Before starting remediation, run preflight checks in order to verify it is safe to remediate. // If any of the following checks fails, we'll surface the reason in the MachineOwnerRemediated condition. - + log.WithValues("Machine", klog.KObj(machineToBeRemediated)) desiredReplicas := int(*controlPlane.KCP.Spec.Replicas) // The cluster MUST have more than one replica, because this is the smallest cluster size that allows any etcd failure tolerance. if controlPlane.Machines.Len() <= 1 { - log.Info("A control plane machine needs remediation, but the number of current replicas is less or equal to 1. Skipping remediation", "UnhealthyMachine", machineToBeRemediated.Name, "Replicas", controlPlane.Machines.Len()) - conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate if current replicas are less or equal then 1") + log.Info("A control plane machine needs remediation, but the number of current replicas is less or equal to 1. Skipping remediation", "Replicas", controlPlane.Machines.Len()) + conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate if current replicas are less than or equal to 1") return ctrl.Result{}, nil } // The number of replicas MUST be equal to or greater than the desired replicas. This rule ensures that when the cluster // is missing replicas, we skip remediation and instead perform regular scale up/rollout operations first. if controlPlane.Machines.Len() < desiredReplicas { - log.Info("A control plane machine needs remediation, but the current number of replicas is lower that expected. Skipping remediation", "UnhealthyMachine", machineToBeRemediated.Name, "Replicas", desiredReplicas, "CurrentReplicas", controlPlane.Machines.Len()) + log.Info("A control plane machine needs remediation, but the current number of replicas is lower that expected. Skipping remediation", "Replicas", desiredReplicas, "CurrentReplicas", controlPlane.Machines.Len()) conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP waiting for having at least %d control plane machines before triggering remediation", desiredReplicas) return ctrl.Result{}, nil } // The cluster MUST have no machines with a deletion timestamp. This rule prevents KCP taking actions while the cluster is in a transitional state. if controlPlane.HasDeletingMachine() { - log.Info("A control plane machine needs remediation, but there are other control-plane machines being deleted. Skipping remediation", "UnhealthyMachine", machineToBeRemediated.Name) + log.Info("A control plane machine needs remediation, but there are other control-plane machines being deleted. Skipping remediation") conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP waiting for control plane machine deletion to complete before triggering remediation") return ctrl.Result{}, nil } @@ -140,7 +141,7 @@ func (r *KubeadmControlPlaneReconciler) reconcileUnhealthyMachines(ctx context.C return ctrl.Result{}, err } if !canSafelyRemediate { - log.Info("A control plane machine needs remediation, but removing this machine could result in etcd quorum loss. Skipping remediation", "UnhealthyMachine", machineToBeRemediated.Name) + log.Info("A control plane machine needs remediation, but removing this machine could result in etcd quorum loss. Skipping remediation") conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate this machine because this could result in etcd loosing quorum") return ctrl.Result{}, nil } @@ -155,12 +156,20 @@ func (r *KubeadmControlPlaneReconciler) reconcileUnhealthyMachines(ctx context.C // If the machine that is about to be deleted is the etcd leader, move it to the newest member available. if controlPlane.IsEtcdManaged() { etcdLeaderCandidate := controlPlane.HealthyMachines().Newest() + if etcdLeaderCandidate == nil { + log.Info("A control plane machine needs remediation, but there is no healthy machine to forward etcd leadership to") + conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.RemediationFailedReason, clusterv1.ConditionSeverityWarning, + "A control plane machine needs remediation, but there is no healthy machine to forward etcd leadership to. Skipping remediation") + return ctrl.Result{}, nil + } if err := workloadCluster.ForwardEtcdLeadership(ctx, machineToBeRemediated, etcdLeaderCandidate); err != nil { - log.Error(err, "Failed to move leadership to candidate machine", "candidate", etcdLeaderCandidate.Name) + log.Error(err, "Failed to move etcd leadership to candidate machine", "candidate", klog.KObj(etcdLeaderCandidate)) + conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.RemediationFailedReason, clusterv1.ConditionSeverityError, err.Error()) return ctrl.Result{}, err } if err := workloadCluster.RemoveEtcdMemberForMachine(ctx, machineToBeRemediated); err != nil { log.Error(err, "Failed to remove etcd member for machine") + conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.RemediationFailedReason, clusterv1.ConditionSeverityError, err.Error()) return ctrl.Result{}, err } } @@ -180,7 +189,7 @@ func (r *KubeadmControlPlaneReconciler) reconcileUnhealthyMachines(ctx context.C return ctrl.Result{}, errors.Wrapf(err, "failed to delete unhealthy machine %s", machineToBeRemediated.Name) } - log.Info("Remediating unhealthy machine", "UnhealthyMachine", machineToBeRemediated.Name) + log.Info("Remediating unhealthy machine") conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "") return ctrl.Result{Requeue: true}, nil } diff --git a/controlplane/kubeadm/internal/controllers/remediation_test.go b/controlplane/kubeadm/internal/controllers/remediation_test.go index e0a6ed81cfae..44b81cc2fa1f 100644 --- a/controlplane/kubeadm/internal/controllers/remediation_test.go +++ b/controlplane/kubeadm/internal/controllers/remediation_test.go @@ -130,7 +130,7 @@ func TestReconcileUnhealthyMachines(t *testing.T) { g.Expect(ret.IsZero()).To(BeTrue()) // Remediation skipped g.Expect(err).ToNot(HaveOccurred()) - assertMachineCondition(ctx, g, m, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate if current replicas are less or equal then 1") + assertMachineCondition(ctx, g, m, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate if current replicas are less than or equal to 1") g.Expect(env.Cleanup(ctx, m)).To(Succeed()) }) @@ -389,6 +389,50 @@ func TestReconcileUnhealthyMachines(t *testing.T) { m1.ObjectMeta.Finalizers = nil g.Expect(patchHelper.Patch(ctx, m1)) + g.Expect(env.Cleanup(ctx, m1, m2, m3, m4)).To(Succeed()) + }) + t.Run("Remediation does not happen if no healthy Control Planes are available to become etcd leader", func(t *testing.T) { + g := NewWithT(t) + + m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed()) + patchHelper, err := patch.NewHelper(m1, env.GetClient()) + g.Expect(err).ToNot(HaveOccurred()) + m1.ObjectMeta.Finalizers = []string{"wait-before-delete"} + g.Expect(patchHelper.Patch(ctx, m1)) + + m2 := createMachine(ctx, g, ns.Name, "m2-healthy-", withMachineHealthCheckFailed(), withHealthyEtcdMember()) + m3 := createMachine(ctx, g, ns.Name, "m3-healthy-", withMachineHealthCheckFailed(), withHealthyEtcdMember()) + m4 := createMachine(ctx, g, ns.Name, "m4-healthy-", withMachineHealthCheckFailed(), withHealthyEtcdMember()) + + controlPlane := &internal.ControlPlane{ + KCP: &controlplanev1.KubeadmControlPlane{Spec: controlplanev1.KubeadmControlPlaneSpec{ + Replicas: utilpointer.Int32Ptr(4), + Version: "v1.19.1", + }}, + Cluster: &clusterv1.Cluster{}, + Machines: collections.FromMachines(m1, m2, m3, m4), + } + + r := &KubeadmControlPlaneReconciler{ + Client: env.GetClient(), + recorder: record.NewFakeRecorder(32), + managementCluster: &fakeManagementCluster{ + Workload: fakeWorkloadCluster{ + EtcdMembersResult: nodes(controlPlane.Machines), + }, + }, + } + _, err = r.reconcileUnhealthyMachines(context.TODO(), controlPlane) + g.Expect(err).ToNot(HaveOccurred()) + + assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.RemediationFailedReason, clusterv1.ConditionSeverityWarning, + "A control plane machine needs remediation, but there is no healthy machine to forward etcd leadership to. Skipping remediation") + + patchHelper, err = patch.NewHelper(m1, env.GetClient()) + g.Expect(err).ToNot(HaveOccurred()) + m1.ObjectMeta.Finalizers = nil + g.Expect(patchHelper.Patch(ctx, m1)) + g.Expect(env.Cleanup(ctx, m1, m2, m3, m4)).To(Succeed()) }) } @@ -432,6 +476,7 @@ func TestCanSafelyRemoveEtcdMember(t *testing.T) { g.Expect(env.Cleanup(ctx, m1)).To(Succeed()) }) + t.Run("Can safely remediate 2 machine CP without additional etcd member failures", func(t *testing.T) { g := NewWithT(t)