Merge pull request #4595 from fabriziopandini/KCP-remediation-2CP
🌱 Update KCP remediation docs and message to support > 1 replicas
k8s-ci-robot authored May 12, 2021 · 2 parents 22c20bd + 2724404 · commit a9da301
Showing 3 changed files with 111 additions and 12 deletions.
8 changes: 4 additions & 4 deletions controlplane/kubeadm/controllers/remediation.go
@@ -79,10 +79,10 @@ func (r *KubeadmControlPlaneReconciler) reconcileUnhealthyMachines(ctx context.C

desiredReplicas := int(*controlPlane.KCP.Spec.Replicas)

-// The cluster MUST have spec.replicas >= 3, because this is the smallest cluster size that allows any etcd failure tolerance.
-if desiredReplicas < 3 {
-	logger.Info("A control plane machine needs remediation, but the number of desired replicas is less than 3. Skipping remediation", "UnhealthyMachine", machineToBeRemediated.Name, "Replicas", desiredReplicas)
-	conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate if there are less than 3 desired replicas")
+// The cluster MUST have more than one replica, because this is the smallest cluster size that can be remediated.
+if controlPlane.Machines.Len() <= 1 {
+	logger.Info("A control plane machine needs remediation, but the number of current replicas is less than or equal to 1. Skipping remediation", "UnhealthyMachine", machineToBeRemediated.Name, "Replicas", controlPlane.Machines.Len())
+	conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate if current replicas are less than or equal to 1")
return ctrl.Result{}, nil
}

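The substantive change is that the guard now keys off the number of control plane machines that actually exist (controlPlane.Machines.Len()) rather than the desired spec.replicas, so a two-machine control plane (for example, one created while a rolling upgrade grows a single-replica cluster) is no longer excluded from remediation. A minimal sketch of the new guard in isolation (hypothetical names, not the actual KCP types):

```go
package main

import "fmt"

// canStartRemediation sketches the precondition introduced above: remediation
// proceeds only when more than one control plane machine currently exists.
// currentMachines stands in for controlPlane.Machines.Len(); the function and
// parameter names are hypothetical, not the real KCP API.
func canStartRemediation(currentMachines int) bool {
	// With a single machine there is nothing to fail over to: deleting the
	// only replica would take down the API server and etcd entirely.
	return currentMachines > 1
}

func main() {
	for _, n := range []int{1, 2, 3} {
		fmt.Printf("current machines: %d, remediation allowed: %v\n", n, canStartRemediation(n))
	}
}
```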
113 changes: 106 additions & 7 deletions controlplane/kubeadm/controllers/remediation_test.go
@@ -77,7 +77,7 @@ func TestReconcileUnhealthyMachines(t *testing.T) {
g.Expect(ret.IsZero()).To(BeTrue()) // Remediation skipped
g.Expect(err).ToNot(HaveOccurred())
})
t.Run("Remediation does not happen if desired replicas < 3", func(t *testing.T) {
t.Run("Remediation does not happen if desired replicas <= 1", func(t *testing.T) {
g := NewWithT(t)

m := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed())
@@ -92,28 +92,29 @@ func TestReconcileUnhealthyMachines(t *testing.T)

g.Expect(ret.IsZero()).To(BeTrue()) // Remediation skipped
g.Expect(err).ToNot(HaveOccurred())
-assertMachineCondition(ctx, g, m, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate if there are less than 3 desired replicas")
+assertMachineCondition(ctx, g, m, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate if current replicas are less than or equal to 1")

g.Expect(testEnv.Cleanup(ctx, m)).To(Succeed())
})
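createMachine and option helpers such as withMachineHealthCheckFailed() are defined elsewhere in remediation_test.go and are not part of this diff. A plausible shape for the helper, assuming it follows the functional-options pattern and sets the condition that the assertions above check (an assumption, not confirmed by this diff; clusterv1 and conditions refer to the packages the test file already imports):

```go
// machineOption mutates a Machine under construction; the functional-options
// pattern sketched here is assumed, since the helper bodies are not in this diff.
type machineOption func(*clusterv1.Machine)

// withMachineHealthCheckFailed presumably marks the machine as needing
// remediation by its owner controller. The condition type, reason, and
// severity are taken from the assertions in this diff, but whether the real
// helper sets exactly this condition is an assumption.
func withMachineHealthCheckFailed() machineOption {
	return func(m *clusterv1.Machine) {
		conditions.MarkFalse(m, clusterv1.MachineOwnerRemediatedCondition,
			clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "")
	}
}
```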
t.Run("Remediation does not happen if number of machines lower than desired", func(t *testing.T) {
g := NewWithT(t)

-m := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed())
+m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed())
+m2 := createMachine(ctx, g, ns.Name, "m2-healthy-")
controlPlane := &internal.ControlPlane{
KCP: &controlplanev1.KubeadmControlPlane{Spec: controlplanev1.KubeadmControlPlaneSpec{
Replicas: utilpointer.Int32Ptr(3),
}},
Cluster: &clusterv1.Cluster{},
-Machines: internal.NewFilterableMachineCollection(m),
+Machines: internal.NewFilterableMachineCollection(m1, m2),
}
ret, err := r.reconcileUnhealthyMachines(context.TODO(), controlPlane)

g.Expect(ret.IsZero()).To(BeTrue()) // Remediation skipped
g.Expect(err).ToNot(HaveOccurred())
-assertMachineCondition(ctx, g, m, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP waiting for having at least 3 control plane machines before triggering remediation")
+assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP waiting for having at least 3 control plane machines before triggering remediation")

-g.Expect(testEnv.Cleanup(ctx, m)).To(Succeed())
+g.Expect(testEnv.Cleanup(ctx, m1, m2)).To(Succeed())
})
t.Run("Remediation does not happen if there is a deleting machine", func(t *testing.T) {
g := NewWithT(t)
@@ -206,7 +207,55 @@ func TestReconcileUnhealthyMachines(t *testing.T) {

g.Expect(testEnv.Cleanup(ctx, m1, m2, m3, m4, m5)).To(Succeed())
})
t.Run("Remediation deletes unhealthy machine", func(t *testing.T) {
t.Run("Remediation deletes unhealthy machine - 2 CP (during 1 CP rolling upgrade)", func(t *testing.T) {
g := NewWithT(t)

m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed())
patchHelper, err := patch.NewHelper(m1, testEnv.GetClient())
g.Expect(err).ToNot(HaveOccurred())
m1.ObjectMeta.Finalizers = []string{"wait-before-delete"}
g.Expect(patchHelper.Patch(ctx, m1)).To(Succeed())

m2 := createMachine(ctx, g, ns.Name, "m2-healthy-", withHealthyEtcdMember())

controlPlane := &internal.ControlPlane{
KCP: &controlplanev1.KubeadmControlPlane{Spec: controlplanev1.KubeadmControlPlaneSpec{
Replicas: utilpointer.Int32Ptr(2),
}},
Cluster: &clusterv1.Cluster{},
Machines: internal.NewFilterableMachineCollection(m1, m2),
}

r := &KubeadmControlPlaneReconciler{
Client: testEnv.GetClient(),
Log: log.Log,
recorder: record.NewFakeRecorder(32),
managementCluster: &fakeManagementCluster{
Workload: fakeWorkloadCluster{
EtcdMembersResult: nodes(controlPlane.Machines),
},
},
}

ret, err := r.reconcileUnhealthyMachines(context.TODO(), controlPlane)

g.Expect(ret.IsZero()).To(BeFalse()) // Remediation completed, requeue
g.Expect(err).ToNot(HaveOccurred())

assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "")

err = testEnv.Get(ctx, client.ObjectKey{Namespace: m1.Namespace, Name: m1.Name}, m1)
g.Expect(err).ToNot(HaveOccurred())
g.Expect(m1.ObjectMeta.DeletionTimestamp.IsZero()).To(BeFalse())

patchHelper, err = patch.NewHelper(m1, testEnv.GetClient())
g.Expect(err).ToNot(HaveOccurred())
m1.ObjectMeta.Finalizers = nil
g.Expect(patchHelper.Patch(ctx, m1)).To(Succeed())

g.Expect(testEnv.Cleanup(ctx, m1, m2)).To(Succeed())
})
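The "wait-before-delete" finalizer is what makes this test observable: with a finalizer present, the API server only sets deletionTimestamp instead of removing the object, so the test can verify that remediation issued the delete and then clear the finalizer to let cleanup finish. The same pattern in isolation, as a sketch assuming a controller-runtime client (the test itself uses a patch helper rather than plain Update, and the import path for clusterv1 is assumed for this era of the codebase):

```go
package kcpsketch

import (
	"context"

	clusterv1 "sigs.k8s.io/cluster-api/api/v1alpha4" // import path assumed
	"sigs.k8s.io/controller-runtime/pkg/client"
)

// observeDeletion sketches the "wait-before-delete" trick used above: a
// finalizer blocks actual removal, so after Delete the object is still
// readable and a non-zero deletionTimestamp proves the delete was issued.
func observeDeletion(ctx context.Context, c client.Client, m *clusterv1.Machine) (bool, error) {
	m.Finalizers = []string{"wait-before-delete"}
	if err := c.Update(ctx, m); err != nil {
		return false, err
	}
	// This is what remediation does to the unhealthy machine.
	if err := c.Delete(ctx, m); err != nil {
		return false, err
	}
	key := client.ObjectKey{Namespace: m.Namespace, Name: m.Name}
	if err := c.Get(ctx, key, m); err != nil {
		return false, err
	}
	deleted := !m.DeletionTimestamp.IsZero()
	// Releasing the finalizer lets the deletion actually complete.
	m.Finalizers = nil
	if err := c.Update(ctx, m); err != nil {
		return deleted, err
	}
	return deleted, nil
}
```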
t.Run("Remediation deletes unhealthy machine - 3 CP", func(t *testing.T) {
g := NewWithT(t)

m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed())
@@ -255,6 +304,56 @@ func TestReconcileUnhealthyMachines(t *testing.T) {

g.Expect(testEnv.Cleanup(ctx, m1, m2, m3)).To(Succeed())
})
t.Run("Remediation deletes unhealthy machine - 4 CP (during 3 CP rolling upgrade)", func(t *testing.T) {
g := NewWithT(t)

m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed())
patchHelper, err := patch.NewHelper(m1, testEnv.GetClient())
g.Expect(err).ToNot(HaveOccurred())
m1.ObjectMeta.Finalizers = []string{"wait-before-delete"}
g.Expect(patchHelper.Patch(ctx, m1)).To(Succeed())

m2 := createMachine(ctx, g, ns.Name, "m2-healthy-", withHealthyEtcdMember())
m3 := createMachine(ctx, g, ns.Name, "m3-healthy-", withHealthyEtcdMember())
m4 := createMachine(ctx, g, ns.Name, "m4-healthy-", withHealthyEtcdMember())

controlPlane := &internal.ControlPlane{
KCP: &controlplanev1.KubeadmControlPlane{Spec: controlplanev1.KubeadmControlPlaneSpec{
Replicas: utilpointer.Int32Ptr(4),
}},
Cluster: &clusterv1.Cluster{},
Machines: internal.NewFilterableMachineCollection(m1, m2, m3, m4),
}

r := &KubeadmControlPlaneReconciler{
Client: testEnv.GetClient(),
Log: log.Log,
recorder: record.NewFakeRecorder(32),
managementCluster: &fakeManagementCluster{
Workload: fakeWorkloadCluster{
EtcdMembersResult: nodes(controlPlane.Machines),
},
},
}

ret, err := r.reconcileUnhealthyMachines(context.TODO(), controlPlane)

g.Expect(ret.IsZero()).To(BeFalse()) // Remediation completed, requeue
g.Expect(err).ToNot(HaveOccurred())

assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "")

err = testEnv.Get(ctx, client.ObjectKey{Namespace: m1.Namespace, Name: m1.Name}, m1)
g.Expect(err).ToNot(HaveOccurred())
g.Expect(m1.ObjectMeta.DeletionTimestamp.IsZero()).To(BeFalse())

patchHelper, err = patch.NewHelper(m1, testEnv.GetClient())
g.Expect(err).ToNot(HaveOccurred())
m1.ObjectMeta.Finalizers = nil
g.Expect(patchHelper.Patch(ctx, m1)).To(Succeed())

g.Expect(testEnv.Cleanup(ctx, m1, m2, m3, m4)).To(Succeed())
})

g.Expect(testEnv.Cleanup(ctx, ns)).To(Succeed())
}
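Each test wires the reconciler to a fakeManagementCluster whose workload cluster reports EtcdMembersResult: nodes(controlPlane.Machines); remediation consults that member list before removing a machine that carries an etcd member. The nodes helper is defined elsewhere in the test file; a plausible sketch (assumed, not shown in this diff, and relying on the test file's existing internal package import) simply collects the machine names so each machine appears to back one etcd member:

```go
// nodes sketches the helper used above (its real body is not part of this
// diff): one etcd member name per machine, so the fake workload cluster's
// member list lines up with the control plane machines.
func nodes(machines internal.FilterableMachineCollection) []string {
	names := make([]string, 0, len(machines))
	for _, machine := range machines {
		names = append(names, machine.Name)
	}
	return names
}
```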
2 changes: 1 addition & 1 deletion docs/proposals/20191017-kubeadm-based-control-plane.md
@@ -394,7 +394,7 @@ spec:
for additional details. When there are multiple machines that are marked for remediation, the oldest one will be remediated first.

- The following rules should be satisfied in order to start remediation (see the sketch after this list):
-  - The cluster MUST have spec.replicas >= 3, because this is the smallest cluster size that allows any etcd failure tolerance.
+  - The cluster MUST have at least two control plane machines, because this is the smallest cluster size that can be remediated.
- The number of replicas MUST be equal to or greater than the desired replicas. This rule ensures that when the cluster
is missing replicas, we skip remediation and instead perform regular scale up/rollout operations first.
- The cluster MUST have no machines with a deletion timestamp. This rule prevents KCP taking actions while the cluster is in a transitional state.
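Taken together, these rules form a preflight gate that runs before any remediation step. Roughly, as a sketch (the type and helper names below are hypothetical, not the actual KCP implementation):

```go
package kcpsketch

import (
	"sort"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// machine is a pared-down stand-in for clusterv1.Machine, carrying only the
// fields the rules above mention.
type machine struct {
	Name              string
	CreationTimestamp metav1.Time
	DeletionTimestamp *metav1.Time
}

// canRemediate checks the preconditions listed in the proposal.
func canRemediate(machines []machine, desiredReplicas int) bool {
	// At least two control plane machines: the smallest remediable cluster.
	if len(machines) <= 1 {
		return false
	}
	// Current replicas must not be below desired; otherwise scale up first.
	if len(machines) < desiredReplicas {
		return false
	}
	// No machine may be mid-deletion: the cluster must not be transitioning.
	for _, m := range machines {
		if m.DeletionTimestamp != nil && !m.DeletionTimestamp.IsZero() {
			return false
		}
	}
	return true
}

// oldestFirst orders remediation candidates oldest first, matching "the
// oldest one will be remediated first" above.
func oldestFirst(machines []machine) []machine {
	sorted := append([]machine(nil), machines...)
	sort.Slice(sorted, func(i, j int) bool {
		return sorted[i].CreationTimestamp.Before(&sorted[j].CreationTimestamp)
	})
	return sorted
}
```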
