Skip to content

Commit

Permalink
Merge pull request #7016 from killianmuldoon/release-1.1
Browse files Browse the repository at this point in the history
🐛  Fix potential nilpointer error in machine remediation
  • Loading branch information
k8s-ci-robot authored Aug 5, 2022
2 parents f9cd695 + 8bc2e59 commit 46706c5
Show file tree
Hide file tree
Showing 3 changed files with 68 additions and 10 deletions.
6 changes: 5 additions & 1 deletion controlplane/kubeadm/internal/controllers/fakes_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
"context"

"github.com/blang/semver"
"github.com/pkg/errors"
"sigs.k8s.io/controller-runtime/pkg/client"

clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
Expand Down Expand Up @@ -69,7 +70,10 @@ type fakeWorkloadCluster struct {
EtcdMembersResult []string
}

func (f fakeWorkloadCluster) ForwardEtcdLeadership(_ context.Context, _ *clusterv1.Machine, _ *clusterv1.Machine) error {
func (f fakeWorkloadCluster) ForwardEtcdLeadership(_ context.Context, _ *clusterv1.Machine, leaderCandidate *clusterv1.Machine) error {
if leaderCandidate == nil {
return errors.New("leaderCandidate is nil")
}
return nil
}

Expand Down
25 changes: 17 additions & 8 deletions controlplane/kubeadm/internal/controllers/remediation.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ import (
"github.com/blang/semver"
"github.com/pkg/errors"
kerrors "k8s.io/apimachinery/pkg/util/errors"
"k8s.io/klog/v2"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"

Expand Down Expand Up @@ -106,27 +107,27 @@ func (r *KubeadmControlPlaneReconciler) reconcileUnhealthyMachines(ctx context.C

// Before starting remediation, run preflight checks in order to verify it is safe to remediate.
// If any of the following checks fails, we'll surface the reason in the MachineOwnerRemediated condition.

log.WithValues("Machine", klog.KObj(machineToBeRemediated))
desiredReplicas := int(*controlPlane.KCP.Spec.Replicas)

// The cluster MUST have more than one replica, because this is the smallest cluster size that allows any etcd failure tolerance.
if controlPlane.Machines.Len() <= 1 {
log.Info("A control plane machine needs remediation, but the number of current replicas is less or equal to 1. Skipping remediation", "UnhealthyMachine", machineToBeRemediated.Name, "Replicas", controlPlane.Machines.Len())
conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate if current replicas are less or equal then 1")
log.Info("A control plane machine needs remediation, but the number of current replicas is less or equal to 1. Skipping remediation", "Replicas", controlPlane.Machines.Len())
conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate if current replicas are less than or equal to 1")
return ctrl.Result{}, nil
}

// The number of replicas MUST be equal to or greater than the desired replicas. This rule ensures that when the cluster
// is missing replicas, we skip remediation and instead perform regular scale up/rollout operations first.
if controlPlane.Machines.Len() < desiredReplicas {
log.Info("A control plane machine needs remediation, but the current number of replicas is lower that expected. Skipping remediation", "UnhealthyMachine", machineToBeRemediated.Name, "Replicas", desiredReplicas, "CurrentReplicas", controlPlane.Machines.Len())
log.Info("A control plane machine needs remediation, but the current number of replicas is lower that expected. Skipping remediation", "Replicas", desiredReplicas, "CurrentReplicas", controlPlane.Machines.Len())
conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP waiting for having at least %d control plane machines before triggering remediation", desiredReplicas)
return ctrl.Result{}, nil
}

// The cluster MUST have no machines with a deletion timestamp. This rule prevents KCP taking actions while the cluster is in a transitional state.
if controlPlane.HasDeletingMachine() {
log.Info("A control plane machine needs remediation, but there are other control-plane machines being deleted. Skipping remediation", "UnhealthyMachine", machineToBeRemediated.Name)
log.Info("A control plane machine needs remediation, but there are other control-plane machines being deleted. Skipping remediation")
conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP waiting for control plane machine deletion to complete before triggering remediation")
return ctrl.Result{}, nil
}
Expand All @@ -140,7 +141,7 @@ func (r *KubeadmControlPlaneReconciler) reconcileUnhealthyMachines(ctx context.C
return ctrl.Result{}, err
}
if !canSafelyRemediate {
log.Info("A control plane machine needs remediation, but removing this machine could result in etcd quorum loss. Skipping remediation", "UnhealthyMachine", machineToBeRemediated.Name)
log.Info("A control plane machine needs remediation, but removing this machine could result in etcd quorum loss. Skipping remediation")
conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate this machine because this could result in etcd loosing quorum")
return ctrl.Result{}, nil
}
Expand All @@ -155,12 +156,20 @@ func (r *KubeadmControlPlaneReconciler) reconcileUnhealthyMachines(ctx context.C
// If the machine that is about to be deleted is the etcd leader, move it to the newest member available.
if controlPlane.IsEtcdManaged() {
etcdLeaderCandidate := controlPlane.HealthyMachines().Newest()
if etcdLeaderCandidate == nil {
log.Info("A control plane machine needs remediation, but there is no healthy machine to forward etcd leadership to")
conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.RemediationFailedReason, clusterv1.ConditionSeverityWarning,
"A control plane machine needs remediation, but there is no healthy machine to forward etcd leadership to. Skipping remediation")
return ctrl.Result{}, nil
}
if err := workloadCluster.ForwardEtcdLeadership(ctx, machineToBeRemediated, etcdLeaderCandidate); err != nil {
log.Error(err, "Failed to move leadership to candidate machine", "candidate", etcdLeaderCandidate.Name)
log.Error(err, "Failed to move etcd leadership to candidate machine", "candidate", klog.KObj(etcdLeaderCandidate))
conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.RemediationFailedReason, clusterv1.ConditionSeverityError, err.Error())
return ctrl.Result{}, err
}
if err := workloadCluster.RemoveEtcdMemberForMachine(ctx, machineToBeRemediated); err != nil {
log.Error(err, "Failed to remove etcd member for machine")
conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.RemediationFailedReason, clusterv1.ConditionSeverityError, err.Error())
return ctrl.Result{}, err
}
}
Expand All @@ -180,7 +189,7 @@ func (r *KubeadmControlPlaneReconciler) reconcileUnhealthyMachines(ctx context.C
return ctrl.Result{}, errors.Wrapf(err, "failed to delete unhealthy machine %s", machineToBeRemediated.Name)
}

log.Info("Remediating unhealthy machine", "UnhealthyMachine", machineToBeRemediated.Name)
log.Info("Remediating unhealthy machine")
conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "")
return ctrl.Result{Requeue: true}, nil
}
Expand Down
47 changes: 46 additions & 1 deletion controlplane/kubeadm/internal/controllers/remediation_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ func TestReconcileUnhealthyMachines(t *testing.T) {

g.Expect(ret.IsZero()).To(BeTrue()) // Remediation skipped
g.Expect(err).ToNot(HaveOccurred())
assertMachineCondition(ctx, g, m, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate if current replicas are less or equal then 1")
assertMachineCondition(ctx, g, m, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate if current replicas are less than or equal to 1")

g.Expect(env.Cleanup(ctx, m)).To(Succeed())
})
Expand Down Expand Up @@ -389,6 +389,50 @@ func TestReconcileUnhealthyMachines(t *testing.T) {
m1.ObjectMeta.Finalizers = nil
g.Expect(patchHelper.Patch(ctx, m1))

g.Expect(env.Cleanup(ctx, m1, m2, m3, m4)).To(Succeed())
})
t.Run("Remediation does not happen if no healthy Control Planes are available to become etcd leader", func(t *testing.T) {
g := NewWithT(t)

m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed())
patchHelper, err := patch.NewHelper(m1, env.GetClient())
g.Expect(err).ToNot(HaveOccurred())
m1.ObjectMeta.Finalizers = []string{"wait-before-delete"}
g.Expect(patchHelper.Patch(ctx, m1))

m2 := createMachine(ctx, g, ns.Name, "m2-healthy-", withMachineHealthCheckFailed(), withHealthyEtcdMember())
m3 := createMachine(ctx, g, ns.Name, "m3-healthy-", withMachineHealthCheckFailed(), withHealthyEtcdMember())
m4 := createMachine(ctx, g, ns.Name, "m4-healthy-", withMachineHealthCheckFailed(), withHealthyEtcdMember())

controlPlane := &internal.ControlPlane{
KCP: &controlplanev1.KubeadmControlPlane{Spec: controlplanev1.KubeadmControlPlaneSpec{
Replicas: utilpointer.Int32Ptr(4),
Version: "v1.19.1",
}},
Cluster: &clusterv1.Cluster{},
Machines: collections.FromMachines(m1, m2, m3, m4),
}

r := &KubeadmControlPlaneReconciler{
Client: env.GetClient(),
recorder: record.NewFakeRecorder(32),
managementCluster: &fakeManagementCluster{
Workload: fakeWorkloadCluster{
EtcdMembersResult: nodes(controlPlane.Machines),
},
},
}
_, err = r.reconcileUnhealthyMachines(context.TODO(), controlPlane)
g.Expect(err).ToNot(HaveOccurred())

assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.RemediationFailedReason, clusterv1.ConditionSeverityWarning,
"A control plane machine needs remediation, but there is no healthy machine to forward etcd leadership to. Skipping remediation")

patchHelper, err = patch.NewHelper(m1, env.GetClient())
g.Expect(err).ToNot(HaveOccurred())
m1.ObjectMeta.Finalizers = nil
g.Expect(patchHelper.Patch(ctx, m1))

g.Expect(env.Cleanup(ctx, m1, m2, m3, m4)).To(Succeed())
})
}
Expand Down Expand Up @@ -432,6 +476,7 @@ func TestCanSafelyRemoveEtcdMember(t *testing.T) {

g.Expect(env.Cleanup(ctx, m1)).To(Succeed())
})

t.Run("Can safely remediate 2 machine CP without additional etcd member failures", func(t *testing.T) {
g := NewWithT(t)

Expand Down

0 comments on commit 46706c5

Please sign in to comment.