Skip to content

Commit

Permalink
KCP remediates unhealthy machines
Browse files Browse the repository at this point in the history
  • Loading branch information
fabriziopandini committed Oct 26, 2020
1 parent 3886ffa commit 8d76ecc
Show file tree
Hide file tree
Showing 14 changed files with 908 additions and 4 deletions.
6 changes: 6 additions & 0 deletions api/v1alpha3/condition_consts.go
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,12 @@ const (
// WaitingForRemediationReason is the reason used when a machine fails a health check and remediation is needed.
WaitingForRemediationReason = "WaitingForRemediation"

// RemediationFailedReason is the reason used when a remediation owner fails a remediate an unhealthy machine.
RemediationFailedReason = "RemediationFailed"

// RemediationInProgressReason is the reason used when an unhealthy machine is being remediated by the remediation owner.
RemediationInProgressReason = "RemediationInProgress"

// ExternalRemediationTemplateAvailable is set on machinehealthchecks when MachineHealthCheck controller uses external remediation.
// ExternalRemediationTemplateAvailable is set to false if external remediation template is not found.
ExternalRemediationTemplateAvailable ConditionType = "ExternalRemediationTemplateAvailable"
Expand Down
7 changes: 5 additions & 2 deletions controllers/machine_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -219,10 +219,13 @@ func patchMachine(ctx context.Context, patchHelper *patch.Helper, machine *clust
// after provisioning - e.g. when a MHC condition exists - or during the deletion process).
conditions.SetSummary(machine,
conditions.WithConditions(
// Infrastructure problems should take precedence over all the other conditions
clusterv1.InfrastructureReadyCondition,
// Boostrap comes after, but it is relevant only during initial machine provisioning.
clusterv1.BootstrapReadyCondition,
clusterv1.MachineOwnerRemediatedCondition,
// MHC reported condition should take precedence over the remediation progress
clusterv1.MachineHealthCheckSuccededCondition,
clusterv1.MachineOwnerRemediatedCondition,
),
conditions.WithStepCounterIf(machine.ObjectMeta.DeletionTimestamp.IsZero()),
conditions.WithStepCounterIfOnly(
Expand All @@ -240,8 +243,8 @@ func patchMachine(ctx context.Context, patchHelper *patch.Helper, machine *clust
clusterv1.BootstrapReadyCondition,
clusterv1.InfrastructureReadyCondition,
clusterv1.DrainingSucceededCondition,
clusterv1.MachineOwnerRemediatedCondition,
clusterv1.MachineHealthCheckSuccededCondition,
clusterv1.MachineOwnerRemediatedCondition,
}},
)

Expand Down
6 changes: 5 additions & 1 deletion controllers/machinehealthcheck_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -300,7 +300,11 @@ func (r *MachineHealthCheckReconciler) reconcile(ctx context.Context, logger log
}
} else {
logger.Info("Target has failed health check, marking for remediation", "target", t.string(), "reason", condition.Reason, "message", condition.Message)
conditions.MarkFalse(t.Machine, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "MachineHealthCheck failed")
// NOTE: MHC is responsible for creating MachineOwnerRemediatedCondition if missing or to trigger another remediation if the previous one is completed;
// instead, if a remediation is in already progress, the remediation owner is responsible for completing the process and MHC should not override the condition.
if !conditions.Has(t.Machine, clusterv1.MachineOwnerRemediatedCondition) || conditions.IsTrue(t.Machine, clusterv1.MachineOwnerRemediatedCondition) {
conditions.MarkFalse(t.Machine, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "")
}
}
}

Expand Down
8 changes: 7 additions & 1 deletion controlplane/kubeadm/controllers/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -302,7 +302,13 @@ func (r *KubeadmControlPlaneReconciler) reconcile(ctx context.Context, cluster *

// Aggregate the operational state of all the machines; while aggregating we are adding the
// source ref (reason@machine/name) so the problem can be easily tracked down to its source machine.
conditions.SetAggregate(controlPlane.KCP, controlplanev1.MachinesReadyCondition, controlPlane.Machines.ConditionGetters(), conditions.AddSourceRef())
conditions.SetAggregate(controlPlane.KCP, controlplanev1.MachinesReadyCondition, ownedMachines.ConditionGetters(), conditions.AddSourceRef(), conditions.WithStepCounterIf(false))

// Reconcile unhealthy machines by triggering deletion and requeue if it is considered safe to remediate,
// otherwise continue with the other MHC operations
if result, err := r.reconcileUnhealthyMachines(ctx, controlPlane); err != nil || !result.IsZero() {
return result, err
}

// reconcileControlPlaneHealth returns err if there is a machine being deleted or if the control plane is unhealthy.
// If the control plane is not yet initialized, this call shouldn't fail.
Expand Down
5 changes: 5 additions & 0 deletions controlplane/kubeadm/controllers/fakes_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ type fakeWorkloadCluster struct {
Status internal.ClusterStatus
ControlPlaneHealthy bool
EtcdHealthy bool
EtcdStatusResult []internal.EtcdMemberStatus
}

func (f fakeWorkloadCluster) ForwardEtcdLeadership(_ context.Context, _ *clusterv1.Machine, _ *clusterv1.Machine) error {
Expand Down Expand Up @@ -112,6 +113,10 @@ func (f fakeWorkloadCluster) EtcdIsHealthy(ctx context.Context) (internal.Health
return nil, nil
}

func (f fakeWorkloadCluster) EtcdStatus(ctx context.Context) ([]internal.EtcdMemberStatus, error) {
return f.EtcdStatusResult, nil
}

type fakeMigrator struct {
migrateCalled bool
migrateErr error
Expand Down
178 changes: 178 additions & 0 deletions controlplane/kubeadm/controllers/remediation.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,178 @@
/*
Copyright 2020 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package controllers

import (
"context"

"github.com/pkg/errors"
clusterv1 "sigs.k8s.io/cluster-api/api/v1alpha3"
"sigs.k8s.io/cluster-api/controlplane/kubeadm/internal"
"sigs.k8s.io/cluster-api/util/conditions"
"sigs.k8s.io/cluster-api/util/patch"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
)

// reconcileUnhealthyMachines handles KubeadmControlPlane unhealthyMachine reconciliation.
func (r *KubeadmControlPlaneReconciler) reconcileUnhealthyMachines(ctx context.Context, c *internal.ControlPlane) (ret ctrl.Result, retErr error) {
logger := r.Log.WithValues("namespace", c.KCP.Namespace, "kubeadmControlPlane", c.KCP.Name, "cluster", c.Cluster.Name)

// Gets all machines that have `MachineHealthCheckSucceeded=False` (indicating a problem was detected on the machine)
// and `MachineOwnerRemediated` present, indicating that KCP is responsible of performing remediation as owner of the machine.
unhealthyMachines := c.UnhealthyMachines()

// If there are no unhealthy machines, return so KCP can proceed with other operations (ctrl.Result nil).
if len(unhealthyMachines) == 0 {
return ctrl.Result{}, nil
}

// Select the machine to be remediated, which is the oldest machine marked as unhealthy.
// NOTE: The current solution is considered acceptable for the most frequent use case (only one unhealthy machine),
// however, in the future this could potentially be improved for the scenario where more than one unhealthy machine exists
// by considering which machine has lower impact on etcd quorum.
machineToBeRemediated := unhealthyMachines.Oldest()

// Returns if the machine is already being remediated
if !machineToBeRemediated.ObjectMeta.DeletionTimestamp.IsZero() {
return ctrl.Result{}, nil
}

patchHelper, err := patch.NewHelper(machineToBeRemediated, r.Client)
if err != nil {
return ctrl.Result{}, err
}

defer func() {
// Always attempt to Patch the Machine conditions after each reconcileUnhealthyMachines.
if err := patchHelper.Patch(ctx, machineToBeRemediated, patch.WithOwnedConditions{Conditions: []clusterv1.ConditionType{
clusterv1.MachineOwnerRemediatedCondition,
}}); err != nil {
logger.Error(err, "Failed to patch control plane Machine", "machine", machineToBeRemediated.Name)
if retErr == nil {
retErr = errors.Wrapf(err, "failed to patch control plane Machine %s", machineToBeRemediated.Name)
}
}
}()

// Before starting remediation, run preflight checks in order to verify it is safe to remediate.
// If those checks are failing, we surface the reason why remediation is not happening into the MachineOwnerRemediated condition
// and then we return so KCP can proceed with other operations (ctrl.Result nil).

desiredReplicas := int(*c.KCP.Spec.Replicas)

// The cluster MUST have spec.replicas >= 3, because this is the smallest cluster size that allows any etcd failure tolerance.
if desiredReplicas < 3 {
logger.Info("A control plane machine needs remediation, but the number of desired replicas is less than 3. Skipping remediation", "UnhealthyMachine", machineToBeRemediated.Name, "Replicas", desiredReplicas)
conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate if there are less than 3 desired replicas")
return ctrl.Result{}, nil
}

// The number of replicas MUST be equal to or greater than the desired replicas. This rule ensures that when the cluster
// is missing replicas, we skip remediation and instead perform regular scale up/rollout operations first.
if c.Machines.Len() < desiredReplicas {
logger.Info("A control plane machine needs remediation, but the current number of replicas is lower that expected. Skipping remediation", "UnhealthyMachine", machineToBeRemediated.Name, "Replicas", desiredReplicas, "CurrentReplicas", c.Machines.Len())
conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP waiting for having at least %d control plane machines before triggering remediation", desiredReplicas)
return ctrl.Result{}, nil
}

// The cluster MUST have no machines with a deletion timestamp. This rule prevents KCP taking actions while the cluster is in a transitional state.
if c.HasDeletingMachine() {
logger.Info("A control plane machine needs remediation, but there are other control-plane machines being deleted. Skipping remediation", "UnhealthyMachine", machineToBeRemediated.Name)
conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP waiting for control plane machine deletion to complete before triggering remediation")
return ctrl.Result{}, nil
}

// Remediation MUST preserve etcd quorum. This rule ensures that we will not remove a member that would result in etcd
// losing a majority of members and thus become unable to field new requests.
if c.IsEtcdManaged() {
canSafelyRemediate, err := r.canSafelyRemoveEtcdMember(ctx, c, machineToBeRemediated)
if err != nil {
conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.RemediationFailedReason, clusterv1.ConditionSeverityError, err.Error())
return ctrl.Result{}, err
}

if !canSafelyRemediate {
logger.Info("A control plane machine needs remediation, but removing this machine could result in etcd loosing quorum. Skipping remediation", "UnhealthyMachine", machineToBeRemediated.Name)
conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate this machine because this could result in etcd loosing quorum")
return ctrl.Result{}, nil
}
}

if err := r.Client.Delete(ctx, machineToBeRemediated); err != nil {
conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.RemediationFailedReason, clusterv1.ConditionSeverityError, err.Error())
return ctrl.Result{}, errors.Wrapf(err, "failed to delete unhealthy machine %s", machineToBeRemediated.Name)
}

logger.Info("Remediating unhealthy machine", "UnhealthyMachine", machineToBeRemediated.Name)
conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "")
return ctrl.Result{Requeue: true}, nil
}

// canSafelyRemoveEtcdMember assess if it is possible to remove the member hosted on the machine to be remediated
// without loosing etcd quorum.
//
// The answer mostly depend on the existence of other failing members on top of the one being deleted, and according
// to the etcd fault tolerance specification (see https://github.com/etcd-io/etcd/blob/master/Documentation/faq.md#what-is-failure-tolerance):
// - 3 CP cluster does not tolerate additional failing members on top of the one being deleted (the target
// cluster size after deletion is 2, fault tolerance 0)
// - 5 CP cluster tolerates 1 additional failing members on top of the one being deleted (the target
// cluster size after deletion is 4, fault tolerance 1)
// - 7 CP cluster tolerates 2 additional failing members on top of the one being deleted (the target
// cluster size after deletion is 6, fault tolerance 2)
// - etc.
func (r *KubeadmControlPlaneReconciler) canSafelyRemoveEtcdMember(ctx context.Context, c *internal.ControlPlane, machineToBeRemediated *clusterv1.Machine) (bool, error) {
logger := r.Log.WithValues("namespace", c.KCP.Namespace, "kubeadmControlPlane", c.KCP.Name, "cluster", c.Cluster.Name)

workloadCluster, err := r.managementCluster.GetWorkloadCluster(ctx, client.ObjectKey{
Namespace: c.Cluster.Namespace,
Name: c.Cluster.Name,
})
if err != nil {
return false, errors.Wrapf(err, "failed to get client for workload cluster %s", c.Cluster.Name)
}

// Gets the etcd status
// NOTE: We are using etcd as a source of truth, because the MHC notion of machine unhealthy might be different than
// the etcd member healthy definition.
// This makes it possible to have a set of etcd members status different from the MHC unhealthy/unhealthy conditions.
etcdStatus, err := workloadCluster.EtcdStatus(ctx)
if err != nil {
return false, errors.Wrapf(err, "failed to get etcdStatus for workload cluster %s", c.Cluster.Name)
}

currentTotalMembers := len(etcdStatus)

targetTotalMembers := currentTotalMembers - 1
targetQuorum := targetTotalMembers/2.0 + 1
targetUnhealthyMembers := 0
for _, etcdMember := range etcdStatus {
// skip the machine to be deleted because it won't be part of the target etcd cluster
if etcdMember.Name == machineToBeRemediated.Name {
continue
}
if !etcdMember.Responsive {
logger.Info("An additional etcd member is reporting unhealthy status while remediation a machine", "MachineToBeRemediated", machineToBeRemediated.Name, "EtcdMember", etcdMember.Name)
targetUnhealthyMembers++
}
}

if targetTotalMembers-targetUnhealthyMembers >= targetQuorum {
return true, nil
}
return false, nil
}
Loading

0 comments on commit 8d76ecc

Please sign in to comment.