KCP remediation for 2CP
fabriziopandini committed May 12, 2021
1 parent 22c20bd commit 2724404
Showing 3 changed files with 111 additions and 12 deletions.
8 changes: 4 additions & 4 deletions controlplane/kubeadm/controllers/remediation.go
@@ -79,10 +79,10 @@ func (r *KubeadmControlPlaneReconciler) reconcileUnhealthyMachines(ctx context.C
 
 	desiredReplicas := int(*controlPlane.KCP.Spec.Replicas)
 
-	// The cluster MUST have spec.replicas >= 3, because this is the smallest cluster size that allows any etcd failure tolerance.
-	if desiredReplicas < 3 {
-		logger.Info("A control plane machine needs remediation, but the number of desired replicas is less than 3. Skipping remediation", "UnhealthyMachine", machineToBeRemediated.Name, "Replicas", desiredReplicas)
-		conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate if there are less than 3 desired replicas")
+	// The cluster MUST have more than one replica, because this is the smallest cluster size that can be remediated.
+	if controlPlane.Machines.Len() <= 1 {
+		logger.Info("A control plane machine needs remediation, but the number of current replicas is less than or equal to 1. Skipping remediation", "UnhealthyMachine", machineToBeRemediated.Name, "Replicas", controlPlane.Machines.Len())
+		conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate if current replicas are less than or equal to 1")
 		return ctrl.Result{}, nil
 	}

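For context on the threshold change: etcd accepts writes only while a quorum of floor(n/2)+1 members is reachable, so three members is the smallest cluster with non-zero failure tolerance, which was the rationale for the old `desiredReplicas < 3` guard. The new guard counts *current* machines instead, so a two-machine control plane (for example, a one-replica cluster in the middle of a rolling upgrade) can still be remediated. A minimal sketch of the quorum arithmetic, illustrative only and not KCP code:

```go
package main

import "fmt"

// failureTolerance returns how many etcd members can fail while the
// cluster still keeps quorum (quorum = n/2 + 1, integer division).
func failureTolerance(members int) int {
	quorum := members/2 + 1
	return members - quorum
}

func main() {
	for n := 1; n <= 5; n++ {
		fmt.Printf("%d member(s): quorum %d, tolerates %d failure(s)\n",
			n, n/2+1, failureTolerance(n))
	}
	// Output:
	// 1 member(s): quorum 1, tolerates 0 failure(s)
	// 2 member(s): quorum 2, tolerates 0 failure(s)
	// 3 member(s): quorum 2, tolerates 1 failure(s)
	// 4 member(s): quorum 3, tolerates 1 failure(s)
	// 5 member(s): quorum 3, tolerates 2 failure(s)
}
```
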
113 changes: 106 additions & 7 deletions controlplane/kubeadm/controllers/remediation_test.go
@@ -77,7 +77,7 @@ func TestReconcileUnhealthyMachines(t *testing.T) {
 		g.Expect(ret.IsZero()).To(BeTrue()) // Remediation skipped
 		g.Expect(err).ToNot(HaveOccurred())
 	})
-	t.Run("Remediation does not happen if desired replicas < 3", func(t *testing.T) {
+	t.Run("Remediation does not happen if desired replicas <= 1", func(t *testing.T) {
 		g := NewWithT(t)
 
 		m := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed())
@@ -92,28 +92,29 @@ func TestReconcileUnhealthyMachines(t *testing.T) {
 
 		g.Expect(ret.IsZero()).To(BeTrue()) // Remediation skipped
 		g.Expect(err).ToNot(HaveOccurred())
-		assertMachineCondition(ctx, g, m, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate if there are less than 3 desired replicas")
+		assertMachineCondition(ctx, g, m, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate if current replicas are less than or equal to 1")
 
 		g.Expect(testEnv.Cleanup(ctx, m)).To(Succeed())
 	})
 	t.Run("Remediation does not happen if number of machines lower than desired", func(t *testing.T) {
 		g := NewWithT(t)
 
-		m := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed())
+		m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed())
+		m2 := createMachine(ctx, g, ns.Name, "m2-healthy-")
 		controlPlane := &internal.ControlPlane{
 			KCP: &controlplanev1.KubeadmControlPlane{Spec: controlplanev1.KubeadmControlPlaneSpec{
 				Replicas: utilpointer.Int32Ptr(3),
 			}},
 			Cluster:  &clusterv1.Cluster{},
-			Machines: internal.NewFilterableMachineCollection(m),
+			Machines: internal.NewFilterableMachineCollection(m1, m2),
 		}
 		ret, err := r.reconcileUnhealthyMachines(context.TODO(), controlPlane)
 
 		g.Expect(ret.IsZero()).To(BeTrue()) // Remediation skipped
 		g.Expect(err).ToNot(HaveOccurred())
-		assertMachineCondition(ctx, g, m, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP waiting for having at least 3 control plane machines before triggering remediation")
+		assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP waiting for having at least 3 control plane machines before triggering remediation")
 
-		g.Expect(testEnv.Cleanup(ctx, m)).To(Succeed())
+		g.Expect(testEnv.Cleanup(ctx, m1, m2)).To(Succeed())
 	})
 	t.Run("Remediation does not happen if there is a deleting machine", func(t *testing.T) {
 		g := NewWithT(t)
@@ -206,7 +207,55 @@ func TestReconcileUnhealthyMachines(t *testing.T) {
 
 		g.Expect(testEnv.Cleanup(ctx, m1, m2, m3, m4, m5)).To(Succeed())
 	})
-	t.Run("Remediation deletes unhealthy machine", func(t *testing.T) {
+	t.Run("Remediation deletes unhealthy machine - 2 CP (during 1 CP rolling upgrade)", func(t *testing.T) {
+		g := NewWithT(t)
+
+		m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed())
+		patchHelper, err := patch.NewHelper(m1, testEnv.GetClient())
+		g.Expect(err).ToNot(HaveOccurred())
+		m1.ObjectMeta.Finalizers = []string{"wait-before-delete"}
+		g.Expect(patchHelper.Patch(ctx, m1)).To(Succeed())
+
+		m2 := createMachine(ctx, g, ns.Name, "m2-healthy-", withHealthyEtcdMember())
+
+		controlPlane := &internal.ControlPlane{
+			KCP: &controlplanev1.KubeadmControlPlane{Spec: controlplanev1.KubeadmControlPlaneSpec{
+				Replicas: utilpointer.Int32Ptr(2),
+			}},
+			Cluster:  &clusterv1.Cluster{},
+			Machines: internal.NewFilterableMachineCollection(m1, m2),
+		}
+
+		r := &KubeadmControlPlaneReconciler{
+			Client:   testEnv.GetClient(),
+			Log:      log.Log,
+			recorder: record.NewFakeRecorder(32),
+			managementCluster: &fakeManagementCluster{
+				Workload: fakeWorkloadCluster{
+					EtcdMembersResult: nodes(controlPlane.Machines),
+				},
+			},
+		}
+
+		ret, err := r.reconcileUnhealthyMachines(context.TODO(), controlPlane)
+
+		g.Expect(ret.IsZero()).To(BeFalse()) // Remediation completed, requeue
+		g.Expect(err).ToNot(HaveOccurred())
+
+		assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "")
+
+		err = testEnv.Get(ctx, client.ObjectKey{Namespace: m1.Namespace, Name: m1.Name}, m1)
+		g.Expect(err).ToNot(HaveOccurred())
+		g.Expect(m1.ObjectMeta.DeletionTimestamp.IsZero()).To(BeFalse())
+
+		patchHelper, err = patch.NewHelper(m1, testEnv.GetClient())
+		g.Expect(err).ToNot(HaveOccurred())
+		m1.ObjectMeta.Finalizers = nil
+		g.Expect(patchHelper.Patch(ctx, m1)).To(Succeed())
+
+		g.Expect(testEnv.Cleanup(ctx, m1, m2)).To(Succeed())
+	})
+	t.Run("Remediation deletes unhealthy machine - 3 CP", func(t *testing.T) {
 		g := NewWithT(t)
 
 		m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed())
@@ -255,6 +304,56 @@ func TestReconcileUnhealthyMachines(t *testing.T) {
 
 		g.Expect(testEnv.Cleanup(ctx, m1, m2, m3)).To(Succeed())
 	})
+	t.Run("Remediation deletes unhealthy machine - 4 CP (during 3 CP rolling upgrade)", func(t *testing.T) {
+		g := NewWithT(t)
+
+		m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed())
+		patchHelper, err := patch.NewHelper(m1, testEnv.GetClient())
+		g.Expect(err).ToNot(HaveOccurred())
+		m1.ObjectMeta.Finalizers = []string{"wait-before-delete"}
+		g.Expect(patchHelper.Patch(ctx, m1)).To(Succeed())
+
+		m2 := createMachine(ctx, g, ns.Name, "m2-healthy-", withHealthyEtcdMember())
+		m3 := createMachine(ctx, g, ns.Name, "m3-healthy-", withHealthyEtcdMember())
+		m4 := createMachine(ctx, g, ns.Name, "m4-healthy-", withHealthyEtcdMember())
+
+		controlPlane := &internal.ControlPlane{
+			KCP: &controlplanev1.KubeadmControlPlane{Spec: controlplanev1.KubeadmControlPlaneSpec{
+				Replicas: utilpointer.Int32Ptr(4),
+			}},
+			Cluster:  &clusterv1.Cluster{},
+			Machines: internal.NewFilterableMachineCollection(m1, m2, m3, m4),
+		}
+
+		r := &KubeadmControlPlaneReconciler{
+			Client:   testEnv.GetClient(),
+			Log:      log.Log,
+			recorder: record.NewFakeRecorder(32),
+			managementCluster: &fakeManagementCluster{
+				Workload: fakeWorkloadCluster{
+					EtcdMembersResult: nodes(controlPlane.Machines),
+				},
+			},
+		}
+
+		ret, err := r.reconcileUnhealthyMachines(context.TODO(), controlPlane)
+
+		g.Expect(ret.IsZero()).To(BeFalse()) // Remediation completed, requeue
+		g.Expect(err).ToNot(HaveOccurred())
+
+		assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "")
+
+		err = testEnv.Get(ctx, client.ObjectKey{Namespace: m1.Namespace, Name: m1.Name}, m1)
+		g.Expect(err).ToNot(HaveOccurred())
+		g.Expect(m1.ObjectMeta.DeletionTimestamp.IsZero()).To(BeFalse())
+
+		patchHelper, err = patch.NewHelper(m1, testEnv.GetClient())
+		g.Expect(err).ToNot(HaveOccurred())
+		m1.ObjectMeta.Finalizers = nil
+		g.Expect(patchHelper.Patch(ctx, m1)).To(Succeed())
+
+		g.Expect(testEnv.Cleanup(ctx, m1, m2, m3, m4)).To(Succeed())
+	})
 
 	g.Expect(testEnv.Cleanup(ctx, ns)).To(Succeed())
 }
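Both new tests use the same trick to observe the deletion: a `wait-before-delete` finalizer keeps the Machine visible after remediation deletes it, so the test can assert that the deletion timestamp is set before releasing the object. A condensed sketch of the pattern, assuming a controller-runtime client; the helper name is illustrative and the object is a Pod only for brevity:

```go
package example

import (
	"context"
	"fmt"

	corev1 "k8s.io/api/core/v1"
	"sigs.k8s.io/controller-runtime/pkg/client"
)

// blockThenObserveDeletion shows the finalizer pattern used in the tests.
func blockThenObserveDeletion(ctx context.Context, c client.Client, pod *corev1.Pod) error {
	// A finalizer prevents actual removal: Delete only sets deletionTimestamp.
	pod.Finalizers = append(pod.Finalizers, "wait-before-delete")
	if err := c.Update(ctx, pod); err != nil {
		return err
	}
	if err := c.Delete(ctx, pod); err != nil {
		return err
	}
	// The object is still retrievable; its deletion is merely pending.
	if err := c.Get(ctx, client.ObjectKey{Namespace: pod.Namespace, Name: pod.Name}, pod); err != nil {
		return err
	}
	fmt.Printf("deletion pending: %v\n", !pod.DeletionTimestamp.IsZero()) // true
	// Clearing the finalizer lets the API server finish the delete.
	pod.Finalizers = nil
	return c.Update(ctx, pod)
}
```
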
2 changes: 1 addition & 1 deletion docs/proposals/20191017-kubeadm-based-control-plane.md
@@ -394,7 +394,7 @@ spec:
 for additional details. When there are multiple machines that are marked for remediation, the oldest one will be remediated first.
 
 - Following rules should be satisfied in order to start remediation
-  - The cluster MUST have spec.replicas >= 3, because this is the smallest cluster size that allows any etcd failure tolerance.
+  - The cluster MUST have at least two control plane machines, because this is the smallest cluster size that can be remediated.
   - The number of replicas MUST be equal to or greater than the desired replicas. This rule ensures that when the cluster
     is missing replicas, we skip remediation and instead perform regular scale up/rollout operations first.
   - The cluster MUST have no machines with a deletion timestamp. This rule prevents KCP taking actions while the cluster is in a transitional state.
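The three rules visible in this excerpt reduce to a small predicate. A hedged sketch follows; the function and parameter names are illustrative rather than actual KCP helpers, and the proposal lists further preconditions beyond the excerpt above:

```go
// canStartRemediation mirrors the proposal's first three preconditions.
func canStartRemediation(currentMachines, desiredReplicas int, hasDeletingMachine bool) bool {
	if currentMachines <= 1 {
		return false // need at least two machines: the smallest remediable cluster
	}
	if currentMachines < desiredReplicas {
		return false // missing replicas: scale up / roll out first, don't remediate
	}
	if hasDeletingMachine {
		return false // cluster is in a transitional state; take no action
	}
	return true
}
```
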
