From 62bd28943d8a6f1703bc13f5927643d94d58c668 Mon Sep 17 00:00:00 2001 From: Ben Moss Date: Mon, 6 Jul 2020 16:39:25 +0000 Subject: [PATCH] comments, comments, comments --- controlplane/kubeadm/controllers/controller.go | 4 +++- .../kubeadm/internal/workload_cluster_etcd.go | 18 ++++++++++++++++++ 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/controlplane/kubeadm/controllers/controller.go b/controlplane/kubeadm/controllers/controller.go index 194ef9a58d3d..e866bc893d1b 100644 --- a/controlplane/kubeadm/controllers/controller.go +++ b/controlplane/kubeadm/controllers/controller.go @@ -602,6 +602,8 @@ func (r *KubeadmControlPlaneReconciler) adoptOwnedSecrets(ctx context.Context, k } func (r *KubeadmControlPlaneReconciler) remediateUnhealthy(ctx context.Context, logger logr.Logger, controlPlane *internal.ControlPlane) error { + // No remediation while cluster is below remediation threshold, or if the cluster is scaling up + // This is to prevent multiple remediations from happening before replacements are created. if !controlPlane.RemediationAllowed() || !controlPlane.HasUnhealthyMachine() || controlPlane.Machines.Len() < int(*controlPlane.KCP.Spec.Replicas) { return nil } @@ -622,7 +624,7 @@ func (r *KubeadmControlPlaneReconciler) remediateUnhealthy(ctx context.Context, machine := controlPlane.UnhealthyMachines().Oldest() - if etcdStatus.FailureTolerance(machine) == 0 { + if etcdStatus.FailureTolerance(machine) <= 0 { logger.Info("cluster has no failure tolerance, skipping remediation") return nil } diff --git a/controlplane/kubeadm/internal/workload_cluster_etcd.go b/controlplane/kubeadm/internal/workload_cluster_etcd.go index bebc6c3f40b9..e4fde01bc6d9 100644 --- a/controlplane/kubeadm/internal/workload_cluster_etcd.go +++ b/controlplane/kubeadm/internal/workload_cluster_etcd.go @@ -337,6 +337,11 @@ type EtcdMemberStatus struct { // FailureTolerance is the amount of members the etcd cluster can afford to // lose without losing quorum. +// It accepts a machine as an argument in order to answer whether this specific +// machine can be safely deleted. For example, a 3-machine cluster has a +// failure tolerance of 1, but if member A is unresponsive and we are +// considering deleting member B, this method will give a failure tolerance of +// 0. Given member A, it would return 1. // Ref: https://github.com/etcd-io/etcd/blob/master/Documentation/faq.md#what-is-failure-tolerance func (e *EtcdStatus) FailureTolerance(machine *clusterv1.Machine) int { var unresponsive int @@ -346,16 +351,29 @@ func (e *EtcdStatus) FailureTolerance(machine *clusterv1.Machine) int { } } + // Calculate the default cluster failure tolerance as number-of-members / 2 + 1. + // Subtract unresponsive members to represent that they are already potentially failed + // and shouldn't be counted as contributing towards quorum. defaultTolerance := len(e.Members) - (len(e.Members)/2.0 + 1) - unresponsive + // If a machine does not have a NodeRef we have no way of correlating it to + // an etcd member. It is unlikely that it is an etcd member yet, but either + // way we have no way of determining if it is responsive or not. if machine.Status.NodeRef == nil { return defaultTolerance } member, found := e.Members[machine.Status.NodeRef.Name] + // If the nodeRef isn't in the map this member is not running etcd for some + // reason, this is an edge case. + // If the member is responsive, use the normal tolerance calculation to + // determine if it is deletable. if !found || member.Responsive { return defaultTolerance } + // If this machine is unresponsive, add 1 to the default tolerance since we + // have already subtracted unresponsive machines from the default tolerance but removing this + // machine will not further reduce quorum. return defaultTolerance + 1 }