Commit
1 parent 24ac840 · commit b34a457
Showing 14 changed files with 1,043 additions and 20 deletions.
@@ -0,0 +1,255 @@
/*
Copyright 2020 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package controllers

import (
	"context"
	"fmt"

	"github.com/pkg/errors"
	clusterv1 "sigs.k8s.io/cluster-api/api/v1alpha3"
	controlplanev1 "sigs.k8s.io/cluster-api/controlplane/kubeadm/api/v1alpha3"
	"sigs.k8s.io/cluster-api/controlplane/kubeadm/internal"
	"sigs.k8s.io/cluster-api/util"
	"sigs.k8s.io/cluster-api/util/conditions"
	"sigs.k8s.io/cluster-api/util/patch"
	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/client"
)
// reconcileUnhealthyMachines tries to remediate KubeadmControlPlane unhealthy machines
// based on the process described in https://github.com/kubernetes-sigs/cluster-api/blob/master/docs/proposals/20191017-kubeadm-based-control-plane.md#remediation-using-delete-and-recreate
func (r *KubeadmControlPlaneReconciler) reconcileUnhealthyMachines(ctx context.Context, controlPlane *internal.ControlPlane) (ret ctrl.Result, retErr error) {
	logger := r.Log.WithValues("namespace", controlPlane.KCP.Namespace, "kubeadmControlPlane", controlPlane.KCP.Name, "cluster", controlPlane.Cluster.Name)

	// Gets all machines that have `MachineHealthCheckSucceeded=False` (indicating a problem was detected on the machine)
	// and `MachineOwnerRemediated` present, indicating that this controller is responsible for performing remediation.
	unhealthyMachines := controlPlane.UnhealthyMachines()

	// If there are no unhealthy machines, return so KCP can proceed with other operations (ctrl.Result nil).
	if len(unhealthyMachines) == 0 {
		return ctrl.Result{}, nil
	}

	// Select the machine to be remediated, which is the oldest machine marked as unhealthy.
	//
	// NOTE: The current solution is considered acceptable for the most frequent use case (only one unhealthy machine),
	// however, in the future this could potentially be improved for the scenario where more than one unhealthy machine exists
	// by considering which machine has lower impact on etcd quorum.
	machineToBeRemediated := unhealthyMachines.Oldest()

	// Returns early if the machine is already in the process of being deleted.
	if !machineToBeRemediated.ObjectMeta.DeletionTimestamp.IsZero() {
		return ctrl.Result{}, nil
	}
	patchHelper, err := patch.NewHelper(machineToBeRemediated, r.Client)
	if err != nil {
		return ctrl.Result{}, err
	}

	defer func() {
		// Always attempt to Patch the Machine conditions after each reconcileUnhealthyMachines.
		if err := patchHelper.Patch(ctx, machineToBeRemediated, patch.WithOwnedConditions{Conditions: []clusterv1.ConditionType{
			clusterv1.MachineOwnerRemediatedCondition,
		}}); err != nil {
			logger.Error(err, "Failed to patch control plane Machine", "machine", machineToBeRemediated.Name)
			if retErr == nil {
				retErr = errors.Wrapf(err, "failed to patch control plane Machine %s", machineToBeRemediated.Name)
			}
		}
	}()

	// Before starting remediation, run preflight checks in order to verify it is safe to remediate.
	// If any of the following checks fails, we'll surface the reason in the MachineOwnerRemediated condition.

	desiredReplicas := int(*controlPlane.KCP.Spec.Replicas)

	// The cluster MUST have spec.replicas >= 3, because this is the smallest cluster size that allows any etcd failure tolerance.
	if desiredReplicas < 3 {
		logger.Info("A control plane machine needs remediation, but the number of desired replicas is less than 3. Skipping remediation", "UnhealthyMachine", machineToBeRemediated.Name, "Replicas", desiredReplicas)
		conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate if there are less than 3 desired replicas")
		return ctrl.Result{}, nil
	}

	// The number of replicas MUST be equal to or greater than the desired replicas. This rule ensures that when the cluster
	// is missing replicas, we skip remediation and instead perform regular scale up/rollout operations first.
	if controlPlane.Machines.Len() < desiredReplicas {
		logger.Info("A control plane machine needs remediation, but the current number of replicas is lower than expected. Skipping remediation", "UnhealthyMachine", machineToBeRemediated.Name, "Replicas", desiredReplicas, "CurrentReplicas", controlPlane.Machines.Len())
		conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP waiting for at least %d control plane machines before triggering remediation", desiredReplicas)
		return ctrl.Result{}, nil
	}

	// The cluster MUST have no machines with a deletion timestamp. This rule prevents KCP taking actions while the cluster is in a transitional state.
	if controlPlane.HasDeletingMachine() {
		logger.Info("A control plane machine needs remediation, but there are other control-plane machines being deleted. Skipping remediation", "UnhealthyMachine", machineToBeRemediated.Name)
		conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP waiting for control plane machine deletion to complete before triggering remediation")
		return ctrl.Result{}, nil
	}
	// Remediation MUST preserve etcd quorum. This rule ensures that we will not remove a member that would result in etcd
	// losing a majority of members and thus become unable to field new requests.
	if controlPlane.IsEtcdManaged() {
		canSafelyRemediate, err := r.canSafelyRemoveEtcdMember(ctx, controlPlane, machineToBeRemediated)
		if err != nil {
			conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.RemediationFailedReason, clusterv1.ConditionSeverityError, err.Error())
			return ctrl.Result{}, err
		}
		if !canSafelyRemediate {
			logger.Info("A control plane machine needs remediation, but removing this machine could result in etcd quorum loss. Skipping remediation", "UnhealthyMachine", machineToBeRemediated.Name)
			conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate this machine because this could result in etcd losing quorum")
			return ctrl.Result{}, nil
		}
	}

	workloadCluster, err := r.managementCluster.GetWorkloadCluster(ctx, util.ObjectKey(controlPlane.Cluster))
	if err != nil {
		logger.Error(err, "Failed to create client to workload cluster")
		return ctrl.Result{}, errors.Wrapf(err, "failed to create client to workload cluster")
	}

	// If the machine that is about to be deleted is the etcd leader, move it to the newest member available.
	if controlPlane.IsEtcdManaged() {
		etcdLeaderCandidate := controlPlane.HealthyMachines().Newest()
		if err := workloadCluster.ForwardEtcdLeadership(ctx, machineToBeRemediated, etcdLeaderCandidate); err != nil {
			logger.Error(err, "Failed to move leadership to candidate machine", "candidate", etcdLeaderCandidate.Name)
			return ctrl.Result{}, err
		}
		if err := workloadCluster.RemoveEtcdMemberForMachine(ctx, machineToBeRemediated); err != nil {
			logger.Error(err, "Failed to remove etcd member for machine")
			return ctrl.Result{}, err
		}
	}
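	// Remove the machine from the kubeadm ConfigMap so the workload cluster's kubeadm configuration no longer
	// tracks the control plane instance being deleted.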
	if err := workloadCluster.RemoveMachineFromKubeadmConfigMap(ctx, machineToBeRemediated); err != nil {
		logger.Error(err, "Failed to remove machine from kubeadm ConfigMap")
		return ctrl.Result{}, err
	}

	if err := r.Client.Delete(ctx, machineToBeRemediated); err != nil {
		conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.RemediationFailedReason, clusterv1.ConditionSeverityError, err.Error())
		return ctrl.Result{}, errors.Wrapf(err, "failed to delete unhealthy machine %s", machineToBeRemediated.Name)
	}
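	// Surface that remediation is in progress and requeue, so that the main KCP reconcile loop can observe the
	// deletion and scale up a replacement machine (remediation is performed by delete and recreate).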
logger.Info("Remediating unhealthy machine", "UnhealthyMachine", machineToBeRemediated.Name) | ||
conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "") | ||
return ctrl.Result{Requeue: true}, nil | ||
} | ||
|
||
// canSafelyRemoveEtcdMember assess if it is possible to remove the member hosted on the machine to be remediated | ||
// without loosing etcd quorum. | ||
// | ||
// The answer mostly depend on the existence of other failing members on top of the one being deleted, and according | ||
// to the etcd fault tolerance specification (see https://github.com/etcd-io/etcd/blob/master/Documentation/faq.md#what-is-failure-tolerance): | ||
// - 3 CP cluster does not tolerate additional failing members on top of the one being deleted (the target | ||
// cluster size after deletion is 2, fault tolerance 0) | ||
// - 5 CP cluster tolerates 1 additional failing members on top of the one being deleted (the target | ||
// cluster size after deletion is 4, fault tolerance 1) | ||
// - 7 CP cluster tolerates 2 additional failing members on top of the one being deleted (the target | ||
// cluster size after deletion is 6, fault tolerance 2) | ||
// - etc. | ||
// | ||
// NOTE: this func assumes the list of members in sync with the list of machines/nodes, it is required to call reconcileEtcdMembers | ||
// ans well as reconcileControlPlaneConditions before this. | ||
func (r *KubeadmControlPlaneReconciler) canSafelyRemoveEtcdMember(ctx context.Context, controlPlane *internal.ControlPlane, machineToBeRemediated *clusterv1.Machine) (bool, error) { | ||
logger := r.Log.WithValues("namespace", controlPlane.KCP.Namespace, "kubeadmControlPlane", controlPlane.KCP.Name, "cluster", controlPlane.Cluster.Name) | ||
|
||
workloadCluster, err := r.managementCluster.GetWorkloadCluster(ctx, client.ObjectKey{ | ||
Namespace: controlPlane.Cluster.Namespace, | ||
Name: controlPlane.Cluster.Name, | ||
}) | ||
if err != nil { | ||
return false, errors.Wrapf(err, "failed to get client for workload cluster %s", controlPlane.Cluster.Name) | ||
} | ||
|
||
// Gets the etcd status | ||
|
||
// This makes it possible to have a set of etcd members status different from the MHC unhealthy/unhealthy conditions. | ||
etcdMembers, err := workloadCluster.EtcdMembers(ctx) | ||
if err != nil { | ||
return false, errors.Wrapf(err, "failed to get etcdStatus for workload cluster %s", controlPlane.Cluster.Name) | ||
} | ||
|
||
currentTotalMembers := len(etcdMembers) | ||
|
||
logger.Info("etcd cluster before remediation", | ||
"currentTotalMembers", currentTotalMembers, | ||
"currentMembers", etcdMembers) | ||
|
||
// The cluster MUST have at least 3 members, because this is the smallest cluster size that allows any etcd failure tolerance. | ||
// | ||
// NOTE: This should not happen given that we are checking the number of replicas before calling this method, however | ||
// given that this could be destructive, this is an additional safeguard. | ||
if currentTotalMembers < 3 { | ||
logger.Info("etcd cluster with less of 3 members can't be safely remediated") | ||
return false, nil | ||
} | ||
|
||
targetTotalMembers := currentTotalMembers - 1 | ||
targetQuorum := targetTotalMembers/2.0 + 1 | ||
targetUnhealthyMembers := 0 | ||
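	// Example: remediating one machine in a 5-member etcd cluster leaves targetTotalMembers=4 and
	// targetQuorum=3 (integer division: 4/2 + 1), so the check below allows at most one of the
	// remaining members to be unhealthy.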
	healthyMembers := []string{}
	unhealthyMembers := []string{}
	for _, etcdMember := range etcdMembers {
		// Skip the machine to be deleted because it won't be part of the target etcd cluster.
		if machineToBeRemediated.Status.NodeRef != nil && machineToBeRemediated.Status.NodeRef.Name == etcdMember {
			continue
		}
		// Search for the machine corresponding to the etcd member.
		var machine *clusterv1.Machine
		for _, m := range controlPlane.Machines {
			if m.Status.NodeRef != nil && m.Status.NodeRef.Name == etcdMember {
				machine = m
				break
			}
		}

		// If an etcd member does not have a corresponding machine, it is not possible to retrieve etcd member health
		// so we are assuming the worst scenario and considering the member unhealthy.
		//
		// NOTE: This should not happen given that we are running reconcileEtcdMembers before calling this method.
		if machine == nil {
			logger.Info("An etcd member does not have a corresponding machine, assuming this member is unhealthy", "MemberName", etcdMember)
			targetUnhealthyMembers++
			unhealthyMembers = append(unhealthyMembers, fmt.Sprintf("%s (no machine)", etcdMember))
			continue
		}

		// Check member health as reported by machine's health conditions
		if !conditions.IsTrue(machine, controlplanev1.MachineEtcdMemberHealthyCondition) {
			targetUnhealthyMembers++
			unhealthyMembers = append(unhealthyMembers, fmt.Sprintf("%s (%s)", etcdMember, machine.Name))
			continue
		}

		healthyMembers = append(healthyMembers, fmt.Sprintf("%s (%s)", etcdMember, machine.Name))
	}

	logger.Info(fmt.Sprintf("etcd cluster projected after remediation of %s", machineToBeRemediated.Name),
		"healthyMembers", healthyMembers,
		"unhealthyMembers", unhealthyMembers,
		"targetTotalMembers", targetTotalMembers,
		"targetQuorum", targetQuorum,
		"targetUnhealthyMembers", targetUnhealthyMembers,
		"projectedQuorum", targetTotalMembers-targetUnhealthyMembers)
	if targetTotalMembers-targetUnhealthyMembers >= targetQuorum {
		return true, nil
	}
	return false, nil
}
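The quorum arithmetic used by the preflight check above can also be exercised in isolation. The following is a minimal, standalone sketch, not part of this commit; canRemediate is a hypothetical helper that mirrors the targetTotalMembers/targetQuorum calculation from canSafelyRemoveEtcdMember.

package main

import "fmt"

// canRemediate reports whether removing one member from an etcd cluster of
// currentTotalMembers, of which otherUnhealthyMembers (excluding the one being
// removed) are unhealthy, still leaves enough healthy members to reach quorum.
func canRemediate(currentTotalMembers, otherUnhealthyMembers int) bool {
	if currentTotalMembers < 3 {
		return false
	}
	targetTotalMembers := currentTotalMembers - 1
	targetQuorum := targetTotalMembers/2 + 1
	return targetTotalMembers-otherUnhealthyMembers >= targetQuorum
}

func main() {
	fmt.Println(canRemediate(3, 0)) // true:  2 members remain, quorum 2, all healthy
	fmt.Println(canRemediate(3, 1)) // false: 2 members remain, quorum 2, only 1 healthy
	fmt.Println(canRemediate(5, 1)) // true:  4 members remain, quorum 3, 3 healthy
	fmt.Println(canRemediate(5, 2)) // false: 4 members remain, quorum 3, only 2 healthy
}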