From 62bd28943d8a6f1703bc13f5927643d94d58c668 Mon Sep 17 00:00:00 2001
From: Ben Moss <benm@vmware.com>
Date: Mon, 6 Jul 2020 16:39:25 +0000
Subject: [PATCH] comments, comments, comments

---
 controlplane/kubeadm/controllers/controller.go |  4 +++-
 .../kubeadm/internal/workload_cluster_etcd.go  | 18 ++++++++++++++++++
 2 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/controlplane/kubeadm/controllers/controller.go b/controlplane/kubeadm/controllers/controller.go
index 194ef9a58d3d..e866bc893d1b 100644
--- a/controlplane/kubeadm/controllers/controller.go
+++ b/controlplane/kubeadm/controllers/controller.go
@@ -602,6 +602,8 @@ func (r *KubeadmControlPlaneReconciler) adoptOwnedSecrets(ctx context.Context, k
 }
 
 func (r *KubeadmControlPlaneReconciler) remediateUnhealthy(ctx context.Context, logger logr.Logger, controlPlane *internal.ControlPlane) error {
+	// No remediation while cluster is below remediation threshold, or if the cluster is scaling up
+	// This is to prevent multiple remediations from happening before replacements are created.
 	if !controlPlane.RemediationAllowed() || !controlPlane.HasUnhealthyMachine() || controlPlane.Machines.Len() < int(*controlPlane.KCP.Spec.Replicas) {
 		return nil
 	}
@@ -622,7 +624,7 @@ func (r *KubeadmControlPlaneReconciler) remediateUnhealthy(ctx context.Context,
 
 	machine := controlPlane.UnhealthyMachines().Oldest()
 
-	if etcdStatus.FailureTolerance(machine) == 0 {
+	if etcdStatus.FailureTolerance(machine) <= 0 {
 		logger.Info("cluster has no failure tolerance, skipping remediation")
 		return nil
 	}
diff --git a/controlplane/kubeadm/internal/workload_cluster_etcd.go b/controlplane/kubeadm/internal/workload_cluster_etcd.go
index bebc6c3f40b9..e4fde01bc6d9 100644
--- a/controlplane/kubeadm/internal/workload_cluster_etcd.go
+++ b/controlplane/kubeadm/internal/workload_cluster_etcd.go
@@ -337,6 +337,11 @@ type EtcdMemberStatus struct {
 
 // FailureTolerance is the amount of members the etcd cluster can afford to
 // lose without losing quorum.
+// It accepts a machine as an argument in order to answer whether this specific
+// machine can be safely deleted. For example, a 3-machine cluster has a
+// failure tolerance of 1, but if member A is unresponsive and we are
+// considering deleting member B, this method will give a failure tolerance of
+// 0. Given member A, it would return 1.
 // Ref: https://github.com/etcd-io/etcd/blob/master/Documentation/faq.md#what-is-failure-tolerance
 func (e *EtcdStatus) FailureTolerance(machine *clusterv1.Machine) int {
 	var unresponsive int
@@ -346,16 +351,29 @@ func (e *EtcdStatus) FailureTolerance(machine *clusterv1.Machine) int {
 		}
 	}
 
+	// Calculate the default cluster failure tolerance as number-of-members / 2 + 1.
+	// Subtract unresponsive members to represent that they are already potentially failed
+	// and shouldn't be counted as contributing towards quorum.
 	defaultTolerance := len(e.Members) - (len(e.Members)/2.0 + 1) - unresponsive
 
+	// If a machine does not have a NodeRef we have no way of correlating it to
+	// an etcd member. It is unlikely that it is an etcd member yet, but either
+	// way we have no way of determining if it is responsive or not.
 	if machine.Status.NodeRef == nil {
 		return defaultTolerance
 	}
 
 	member, found := e.Members[machine.Status.NodeRef.Name]
+	// If the nodeRef isn't in the map this member is not running etcd for some
+	// reason, this is an edge case.
+	// If the member is responsive, use the normal tolerance calculation to
+	// determine if it is deletable.
 	if !found || member.Responsive {
 		return defaultTolerance
 	}
 
+	// If this machine is unresponsive, add 1 to the default tolerance since we
+	// have already subtracted unresponsive machines from the default tolerance but removing this
+	// machine will not further reduce quorum.
 	return defaultTolerance + 1
 }