Merge pull request #3830 from fabriziopandini/kcp-remediation
✨ KCP remediates unhealthy machines
k8s-ci-robot authored Nov 12, 2020
2 parents 24ac840 + b34a457 commit 24f10f8
Showing 14 changed files with 1,043 additions and 20 deletions.
6 changes: 6 additions & 0 deletions api/v1alpha3/condition_consts.go
@@ -131,6 +131,12 @@ const (
// WaitingForRemediationReason is the reason used when a machine fails a health check and remediation is needed.
WaitingForRemediationReason = "WaitingForRemediation"

// RemediationFailedReason is the reason used when a remediation owner fails to remediate an unhealthy machine.
RemediationFailedReason = "RemediationFailed"

// RemediationInProgressReason is the reason used when an unhealthy machine is being remediated by the remediation owner.
RemediationInProgressReason = "RemediationInProgress"

// ExternalRemediationTemplateAvailable is set on machinehealthchecks when MachineHealthCheck controller uses external remediation.
// ExternalRemediationTemplateAvailable is set to false if external remediation template is not found.
ExternalRemediationTemplateAvailable ConditionType = "ExternalRemediationTemplateAvailable"
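The two new reasons are meant for whoever owns remediation of the machine (KCP in this change) when it updates the MachineOwnerRemediatedCondition. A minimal sketch of that usage, mirroring the conditions.MarkFalse calls added in remediation.go further down; the helper name markRemediationOutcome is illustrative and not part of the commit:

package example

import (
	clusterv1 "sigs.k8s.io/cluster-api/api/v1alpha3"
	"sigs.k8s.io/cluster-api/util/conditions"
)

// markRemediationOutcome is an illustrative helper showing how a remediation owner
// could surface progress or failure on the machine using the new reasons.
func markRemediationOutcome(machine *clusterv1.Machine, remediationErr error) {
	if remediationErr != nil {
		// The remediation owner failed to remediate the unhealthy machine.
		conditions.MarkFalse(machine, clusterv1.MachineOwnerRemediatedCondition,
			clusterv1.RemediationFailedReason, clusterv1.ConditionSeverityError, remediationErr.Error())
		return
	}
	// Remediation has been triggered and is still in progress.
	conditions.MarkFalse(machine, clusterv1.MachineOwnerRemediatedCondition,
		clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "")
}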
7 changes: 5 additions & 2 deletions controllers/machine_controller.go
@@ -219,10 +219,13 @@ func patchMachine(ctx context.Context, patchHelper *patch.Helper, machine *clust
// after provisioning - e.g. when a MHC condition exists - or during the deletion process).
conditions.SetSummary(machine,
conditions.WithConditions(
// Infrastructure problems should take precedence over all the other conditions
clusterv1.InfrastructureReadyCondition,
// Bootstrap comes after, but it is relevant only during initial machine provisioning.
clusterv1.BootstrapReadyCondition,
clusterv1.MachineOwnerRemediatedCondition,
// MHC reported condition should take precedence over the remediation progress
clusterv1.MachineHealthCheckSuccededCondition,
clusterv1.MachineOwnerRemediatedCondition,
),
conditions.WithStepCounterIf(machine.ObjectMeta.DeletionTimestamp.IsZero()),
conditions.WithStepCounterIfOnly(
@@ -240,8 +243,8 @@ func patchMachine(ctx context.Context, patchHelper *patch.Helper, machine *clust
clusterv1.BootstrapReadyCondition,
clusterv1.InfrastructureReadyCondition,
clusterv1.DrainingSucceededCondition,
clusterv1.MachineOwnerRemediatedCondition,
clusterv1.MachineHealthCheckSuccededCondition,
clusterv1.MachineOwnerRemediatedCondition,
}},
)

6 changes: 5 additions & 1 deletion controllers/machinehealthcheck_controller.go
@@ -382,7 +382,11 @@ func (r *MachineHealthCheckReconciler) PatchUnhealthyTargets(ctx context.Context
}
} else {
r.Log.Info("Target has failed health check, marking for remediation", "target", t.string(), "reason", condition.Reason, "message", condition.Message)
conditions.MarkFalse(t.Machine, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "MachineHealthCheck failed")
// NOTE: MHC is responsible for creating the MachineOwnerRemediatedCondition if missing or for triggering another remediation if the previous one is completed;
// instead, if a remediation is already in progress, the remediation owner is responsible for completing the process and MHC should not overwrite the condition.
if !conditions.Has(t.Machine, clusterv1.MachineOwnerRemediatedCondition) || conditions.IsTrue(t.Machine, clusterv1.MachineOwnerRemediatedCondition) {
conditions.MarkFalse(t.Machine, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "")
}
}
}

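The guard added above is effectively a handshake between MachineHealthCheck and the remediation owner. The same decision, extracted into a standalone helper for illustration (the name shouldMarkForRemediation is hypothetical; the calls are the same conditions utilities used in the diff):

package example

import (
	clusterv1 "sigs.k8s.io/cluster-api/api/v1alpha3"
	"sigs.k8s.io/cluster-api/util/conditions"
)

// shouldMarkForRemediation mirrors the condition in PatchUnhealthyTargets:
// MHC creates or re-sets MachineOwnerRemediatedCondition only when the condition is
// missing or when the previous remediation has completed (condition back to True);
// while a remediation is in progress, the remediation owner keeps ownership of the condition.
func shouldMarkForRemediation(machine *clusterv1.Machine) bool {
	return !conditions.Has(machine, clusterv1.MachineOwnerRemediatedCondition) ||
		conditions.IsTrue(machine, clusterv1.MachineOwnerRemediatedCondition)
}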
8 changes: 7 additions & 1 deletion controlplane/kubeadm/controllers/controller.go
@@ -302,7 +302,7 @@ func (r *KubeadmControlPlaneReconciler) reconcile(ctx context.Context, cluster *

// Aggregate the operational state of all the machines; while aggregating we are adding the
// source ref (reason@machine/name) so the problem can be easily tracked down to its source machine.
conditions.SetAggregate(controlPlane.KCP, controlplanev1.MachinesReadyCondition, controlPlane.Machines.ConditionGetters(), conditions.AddSourceRef())
conditions.SetAggregate(controlPlane.KCP, controlplanev1.MachinesReadyCondition, ownedMachines.ConditionGetters(), conditions.AddSourceRef(), conditions.WithStepCounterIf(false))

// Updates conditions reporting the status of static pods and the status of the etcd cluster.
// NOTE: Conditions reporting KCP operation progress like e.g. Resized or SpecUpToDate are inlined with the rest of the execution.
@@ -316,6 +316,12 @@ func (r *KubeadmControlPlaneReconciler) reconcile(ctx context.Context, cluster *
return result, err
}

// Reconcile unhealthy machines by triggering deletion and requeue if it is considered safe to remediate,
// otherwise continue with the other KCP operations.
if result, err := r.reconcileUnhealthyMachines(ctx, controlPlane); err != nil || !result.IsZero() {
return result, err
}

// Control plane machines rollout due to configuration changes (e.g. upgrades) takes precedence over other operations.
needRollout := controlPlane.MachinesNeedingRollout()
switch {
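The new call follows the pattern the surrounding reconcile already uses: each step returns a ctrl.Result, and a non-zero result or an error short-circuits the steps that follow (here, rollout and scale operations). A self-contained sketch of that chaining, with hypothetical names:

package example

import (
	"context"

	ctrl "sigs.k8s.io/controller-runtime"
)

// reconcilePhase is the shape of each step: a non-zero Result means
// "stop and requeue"; an error also stops the sequence.
type reconcilePhase func(ctx context.Context) (ctrl.Result, error)

// runPhases chains phases the way the KCP reconcile chains
// reconcileUnhealthyMachines ahead of rollout and scale (illustrative only).
func runPhases(ctx context.Context, phases ...reconcilePhase) (ctrl.Result, error) {
	for _, p := range phases {
		if result, err := p(ctx); err != nil || !result.IsZero() {
			return result, err
		}
	}
	return ctrl.Result{}, nil
}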
15 changes: 14 additions & 1 deletion controlplane/kubeadm/controllers/fakes_test.go
@@ -56,7 +56,8 @@ func (f *fakeManagementCluster) GetMachinesForCluster(c context.Context, n clien

type fakeWorkloadCluster struct {
*internal.Workload
Status internal.ClusterStatus
Status internal.ClusterStatus
EtcdMembersResult []string
}

func (f fakeWorkloadCluster) ForwardEtcdLeadership(_ context.Context, _ *clusterv1.Machine, _ *clusterv1.Machine) error {
@@ -95,6 +96,18 @@ func (f fakeWorkloadCluster) UpdateKubeletConfigMap(ctx context.Context, version
return nil
}

func (f fakeWorkloadCluster) RemoveEtcdMemberForMachine(ctx context.Context, machine *clusterv1.Machine) error {
return nil
}

func (f fakeWorkloadCluster) RemoveMachineFromKubeadmConfigMap(ctx context.Context, machine *clusterv1.Machine) error {
return nil
}

func (f fakeWorkloadCluster) EtcdMembers(_ context.Context) ([]string, error) {
return f.EtcdMembersResult, nil
}

type fakeMigrator struct {
migrateCalled bool
migrateErr error
255 changes: 255 additions & 0 deletions controlplane/kubeadm/controllers/remediation.go
@@ -0,0 +1,255 @@
/*
Copyright 2020 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package controllers

import (
"context"
"fmt"

"github.com/pkg/errors"
clusterv1 "sigs.k8s.io/cluster-api/api/v1alpha3"
controlplanev1 "sigs.k8s.io/cluster-api/controlplane/kubeadm/api/v1alpha3"
"sigs.k8s.io/cluster-api/controlplane/kubeadm/internal"
"sigs.k8s.io/cluster-api/util"
"sigs.k8s.io/cluster-api/util/conditions"
"sigs.k8s.io/cluster-api/util/patch"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
)

// reconcileUnhealthyMachines tries to remediate KubeadmControlPlane unhealthy machines
// based on the process described in https://github.com/kubernetes-sigs/cluster-api/blob/master/docs/proposals/20191017-kubeadm-based-control-plane.md#remediation-using-delete-and-recreate
func (r *KubeadmControlPlaneReconciler) reconcileUnhealthyMachines(ctx context.Context, controlPlane *internal.ControlPlane) (ret ctrl.Result, retErr error) {
logger := r.Log.WithValues("namespace", controlPlane.KCP.Namespace, "kubeadmControlPlane", controlPlane.KCP.Name, "cluster", controlPlane.Cluster.Name)

// Gets all machines that have `MachineHealthCheckSucceeded=False` (indicating a problem was detected on the machine)
// and `MachineOwnerRemediated` present, indicating that this controller is responsible for performing remediation.
unhealthyMachines := controlPlane.UnhealthyMachines()

// If there are no unhealthy machines, return so KCP can proceed with other operations (ctrl.Result nil).
if len(unhealthyMachines) == 0 {
return ctrl.Result{}, nil
}

// Select the machine to be remediated, which is the oldest machine marked as unhealthy.
//
// NOTE: The current solution is considered acceptable for the most frequent use case (only one unhealthy machine);
// however, in the future this could potentially be improved for the scenario where more than one unhealthy machine exists
// by considering which machine has the lowest impact on etcd quorum.
machineToBeRemediated := unhealthyMachines.Oldest()

// Returns if the machine is in the process of being deleted.
if !machineToBeRemediated.ObjectMeta.DeletionTimestamp.IsZero() {
return ctrl.Result{}, nil
}

patchHelper, err := patch.NewHelper(machineToBeRemediated, r.Client)
if err != nil {
return ctrl.Result{}, err
}

defer func() {
// Always attempt to Patch the Machine conditions after each reconcileUnhealthyMachines.
if err := patchHelper.Patch(ctx, machineToBeRemediated, patch.WithOwnedConditions{Conditions: []clusterv1.ConditionType{
clusterv1.MachineOwnerRemediatedCondition,
}}); err != nil {
logger.Error(err, "Failed to patch control plane Machine", "machine", machineToBeRemediated.Name)
if retErr == nil {
retErr = errors.Wrapf(err, "failed to patch control plane Machine %s", machineToBeRemediated.Name)
}
}
}()

// Before starting remediation, run preflight checks in order to verify it is safe to remediate.
// If any of the following checks fails, we'll surface the reason in the MachineOwnerRemediated condition.

desiredReplicas := int(*controlPlane.KCP.Spec.Replicas)

// The cluster MUST have spec.replicas >= 3, because this is the smallest cluster size that allows any etcd failure tolerance.
if desiredReplicas < 3 {
logger.Info("A control plane machine needs remediation, but the number of desired replicas is less than 3. Skipping remediation", "UnhealthyMachine", machineToBeRemediated.Name, "Replicas", desiredReplicas)
conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate if there are less than 3 desired replicas")
return ctrl.Result{}, nil
}

// The number of replicas MUST be equal to or greater than the desired replicas. This rule ensures that when the cluster
// is missing replicas, we skip remediation and instead perform regular scale up/rollout operations first.
if controlPlane.Machines.Len() < desiredReplicas {
logger.Info("A control plane machine needs remediation, but the current number of replicas is lower that expected. Skipping remediation", "UnhealthyMachine", machineToBeRemediated.Name, "Replicas", desiredReplicas, "CurrentReplicas", controlPlane.Machines.Len())
conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP waiting for having at least %d control plane machines before triggering remediation", desiredReplicas)
return ctrl.Result{}, nil
}

// The cluster MUST have no machines with a deletion timestamp. This rule prevents KCP taking actions while the cluster is in a transitional state.
if controlPlane.HasDeletingMachine() {
logger.Info("A control plane machine needs remediation, but there are other control-plane machines being deleted. Skipping remediation", "UnhealthyMachine", machineToBeRemediated.Name)
conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP waiting for control plane machine deletion to complete before triggering remediation")
return ctrl.Result{}, nil
}

// Remediation MUST preserve etcd quorum. This rule ensures that we will not remove a member that would result in etcd
// losing a majority of members and thus become unable to field new requests.
if controlPlane.IsEtcdManaged() {
canSafelyRemediate, err := r.canSafelyRemoveEtcdMember(ctx, controlPlane, machineToBeRemediated)
if err != nil {
conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.RemediationFailedReason, clusterv1.ConditionSeverityError, err.Error())
return ctrl.Result{}, err
}
if !canSafelyRemediate {
logger.Info("A control plane machine needs remediation, but removing this machine could result in etcd quorum loss. Skipping remediation", "UnhealthyMachine", machineToBeRemediated.Name)
conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate this machine because this could result in etcd losing quorum")
return ctrl.Result{}, nil
}
}

workloadCluster, err := r.managementCluster.GetWorkloadCluster(ctx, util.ObjectKey(controlPlane.Cluster))
if err != nil {
logger.Error(err, "Failed to create client to workload cluster")
return ctrl.Result{}, errors.Wrapf(err, "failed to create client to workload cluster")
}

// If the machine that is about to be deleted is the etcd leader, move it to the newest member available.
if controlPlane.IsEtcdManaged() {
etcdLeaderCandidate := controlPlane.HealthyMachines().Newest()
if err := workloadCluster.ForwardEtcdLeadership(ctx, machineToBeRemediated, etcdLeaderCandidate); err != nil {
logger.Error(err, "Failed to move leadership to candidate machine", "candidate", etcdLeaderCandidate.Name)
return ctrl.Result{}, err
}
if err := workloadCluster.RemoveEtcdMemberForMachine(ctx, machineToBeRemediated); err != nil {
logger.Error(err, "Failed to remove etcd member for machine")
return ctrl.Result{}, err
}
}

if err := workloadCluster.RemoveMachineFromKubeadmConfigMap(ctx, machineToBeRemediated); err != nil {
logger.Error(err, "Failed to remove machine from kubeadm ConfigMap")
return ctrl.Result{}, err
}

if err := r.Client.Delete(ctx, machineToBeRemediated); err != nil {
conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.RemediationFailedReason, clusterv1.ConditionSeverityError, err.Error())
return ctrl.Result{}, errors.Wrapf(err, "failed to delete unhealthy machine %s", machineToBeRemediated.Name)
}

logger.Info("Remediating unhealthy machine", "UnhealthyMachine", machineToBeRemediated.Name)
conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "")
return ctrl.Result{Requeue: true}, nil
}

// canSafelyRemoveEtcdMember assesses whether it is possible to remove the member hosted on the machine to be remediated
// without losing etcd quorum.
//
// The answer mostly depends on the existence of other failing members on top of the one being deleted, and according
// to the etcd fault tolerance specification (see https://github.com/etcd-io/etcd/blob/master/Documentation/faq.md#what-is-failure-tolerance):
// - 3 CP cluster does not tolerate additional failing members on top of the one being deleted (the target
// cluster size after deletion is 2, fault tolerance 0)
// - 5 CP cluster tolerates 1 additional failing member on top of the one being deleted (the target
// cluster size after deletion is 4, fault tolerance 1)
// - 7 CP cluster tolerates 2 additional failing members on top of the one being deleted (the target
// cluster size after deletion is 6, fault tolerance 2)
// - etc.
//
// NOTE: this func assumes the list of members is in sync with the list of machines/nodes; it is required to call reconcileEtcdMembers
// as well as reconcileControlPlaneConditions before this.
func (r *KubeadmControlPlaneReconciler) canSafelyRemoveEtcdMember(ctx context.Context, controlPlane *internal.ControlPlane, machineToBeRemediated *clusterv1.Machine) (bool, error) {
logger := r.Log.WithValues("namespace", controlPlane.KCP.Namespace, "kubeadmControlPlane", controlPlane.KCP.Name, "cluster", controlPlane.Cluster.Name)

workloadCluster, err := r.managementCluster.GetWorkloadCluster(ctx, client.ObjectKey{
Namespace: controlPlane.Cluster.Namespace,
Name: controlPlane.Cluster.Name,
})
if err != nil {
return false, errors.Wrapf(err, "failed to get client for workload cluster %s", controlPlane.Cluster.Name)
}

// Gets the etcd status.
// This makes it possible to have a set of etcd member statuses different from the MHC healthy/unhealthy conditions.
etcdMembers, err := workloadCluster.EtcdMembers(ctx)
if err != nil {
return false, errors.Wrapf(err, "failed to get etcdStatus for workload cluster %s", controlPlane.Cluster.Name)
}

currentTotalMembers := len(etcdMembers)

logger.Info("etcd cluster before remediation",
"currentTotalMembers", currentTotalMembers,
"currentMembers", etcdMembers)

// The cluster MUST have at least 3 members, because this is the smallest cluster size that allows any etcd failure tolerance.
//
// NOTE: This should not happen given that we are checking the number of replicas before calling this method, however
// given that this could be destructive, this is an additional safeguard.
if currentTotalMembers < 3 {
logger.Info("etcd cluster with less of 3 members can't be safely remediated")
return false, nil
}

targetTotalMembers := currentTotalMembers - 1
targetQuorum := targetTotalMembers/2.0 + 1
targetUnhealthyMembers := 0

healthyMembers := []string{}
unhealthyMembers := []string{}
for _, etcdMember := range etcdMembers {
// Skip the machine to be deleted because it won't be part of the target etcd cluster.
if etcdMember == machineToBeRemediated.Name {
continue
}

// Search for the machine corresponding to the etcd member.
var machine *clusterv1.Machine
for _, m := range controlPlane.Machines {
if m.Status.NodeRef != nil && m.Status.NodeRef.Name == etcdMember {
machine = m
break
}
}

// If an etcd member does not have a corresponding machine, it is not possible to retrieve etcd member health
// so we are assuming the worst scenario and considering the member unhealthy.
//
// NOTE: This should not happen given that we are running reconcileEtcdMembers before calling this method.
if machine == nil {
logger.Info("An etcd member does not have a corresponding machine, assuming this member is unhealthy", "MemberName", etcdMember)
targetUnhealthyMembers++
unhealthyMembers = append(unhealthyMembers, fmt.Sprintf("%s (no machine)", etcdMember))
continue
}

// Check member health as reported by machine's health conditions
if !conditions.IsTrue(machine, controlplanev1.MachineEtcdMemberHealthyCondition) {
targetUnhealthyMembers++
unhealthyMembers = append(unhealthyMembers, fmt.Sprintf("%s (%s)", etcdMember, machine.Name))
continue
}

healthyMembers = append(healthyMembers, fmt.Sprintf("%s (%s)", etcdMember, machine.Name))
}

logger.Info(fmt.Sprintf("etcd cluster projected after remediation of %s", machineToBeRemediated.Name),
"healthyMembers", healthyMembers,
"unhealthyMembers", unhealthyMembers,
"targetTotalMembers", targetTotalMembers,
"targetQuorum", targetQuorum,
"targetUnhealthyMembers", targetUnhealthyMembers,
"projectedQuorum", targetTotalMembers-targetUnhealthyMembers)
if targetTotalMembers-targetUnhealthyMembers >= targetQuorum {
return true, nil
}
return false, nil
}
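For reference, the quorum arithmetic above can be checked with a small standalone program (an editor's sketch, not part of the commit; note that `targetTotalMembers/2.0` in the original is, under Go's untyped-constant rules, the same integer division as `/2`): a 3-member cluster leaves 2 members with quorum 2, so no additional unhealthy member is tolerated; 5 members leave 4 with quorum 3, tolerating 1; 7 members leave 6 with quorum 4, tolerating 2.

package main

import "fmt"

// canSafelyRemediate mirrors the check in canSafelyRemoveEtcdMember: after removing
// the member hosted on the unhealthy machine, the remaining healthy members must
// still reach quorum (floor(target/2) + 1).
func canSafelyRemediate(currentTotalMembers, additionalUnhealthyMembers int) bool {
	targetTotalMembers := currentTotalMembers - 1
	targetQuorum := targetTotalMembers/2 + 1
	return targetTotalMembers-additionalUnhealthyMembers >= targetQuorum
}

func main() {
	for _, members := range []int{3, 5, 7} {
		for unhealthy := 0; unhealthy <= 2; unhealthy++ {
			fmt.Printf("members=%d additionalUnhealthy=%d safe=%v\n",
				members, unhealthy, canSafelyRemediate(members, unhealthy))
		}
	}
}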