
🐛 KCP health checks handle errors and check failures consistently #3863

Closed
38 changes: 14 additions & 24 deletions controlplane/kubeadm/controllers/controller.go
@@ -384,34 +384,32 @@ func (r *KubeadmControlPlaneReconciler) reconcileDelete(ctx context.Context, clu
logger := r.Log.WithValues("namespace", kcp.Namespace, "kubeadmControlPlane", kcp.Name, "cluster", cluster.Name)
logger.Info("Reconcile KubeadmControlPlane deletion")

// Gets all machines, not just control plane machines.
allMachines, err := r.managementCluster.GetMachinesForCluster(ctx, util.ObjectKey(cluster))
Contributor

what about MachinePools?

Member Author

This PR is just shuffling things around to fix a bug, and this bit is not changing from what we currently have.
However, this is a good point and I will open an issue to track that we are missing MachinePools here.

if err != nil {
return ctrl.Result{}, err
}
ownedMachines := allMachines.Filter(machinefilters.OwnedMachines(kcp))

// Ignore the health check results here as well as the errors; the health check functions only set health-related conditions on Machines.
// Errors may be due to not being able to get workload cluster nodes.
workloadCluster, err := r.managementCluster.GetWorkloadCluster(ctx, util.ObjectKey(cluster))
if err != nil {
r.Log.V(2).Info("Cannot get remote client to workload cluster during delete reconciliation", "err", err.Error())
} else {
// Do a health check of the Control Plane components
_, err = workloadCluster.ControlPlaneIsHealthy(ctx)
if err != nil {
if err := workloadCluster.ControlPlaneIsHealthy(ctx, ownedMachines.UnsortedList()); err != nil {
// Do nothing
r.Log.V(2).Info("Control plane did not pass control plane health check during delete reconciliation", "err", err.Error())
}

// Do a health check of the etcd
_, err = workloadCluster.EtcdIsHealthy(ctx)
if err != nil {
if err := workloadCluster.EtcdIsHealthy(ctx, ownedMachines.UnsortedList()); err != nil {
// Do nothing
r.Log.V(2).Info("Control plane did not pass etcd health check during delete reconciliation", "err", err.Error())
}
}

// Gets all machines, not just control plane machines.
allMachines, err := r.managementCluster.GetMachinesForCluster(ctx, util.ObjectKey(cluster))
if err != nil {
return ctrl.Result{}, err
}
ownedMachines := allMachines.Filter(machinefilters.OwnedMachines(kcp))

// If no control plane machines remain, remove the finalizer
if len(ownedMachines) == 0 {
controllerutil.RemoveFinalizer(kcp, controlplanev1.KubeadmControlPlaneFinalizer)
@@ -486,31 +484,23 @@ func (r *KubeadmControlPlaneReconciler) reconcileControlPlaneHealth(ctx context.

errList := []error{}

// Do a health check of the Control Plane components
checkResult, err := workloadCluster.ControlPlaneIsHealthy(ctx)
if err != nil {
errList = append(errList, errors.Wrap(err, "failed to pass control-plane health check"))
} else if err := checkResult.Aggregate(controlPlane); err != nil {
// Do a health check of the Control Plane components.
if err := workloadCluster.ControlPlaneIsHealthy(ctx, controlPlane.Machines.UnsortedList()); err != nil {
r.recorder.Eventf(kcp, corev1.EventTypeWarning, "ControlPlaneUnhealthy",
"Waiting for control plane to pass control plane health check to continue reconciliation: %v", err)
errList = append(errList, errors.Wrap(err, "failed to pass control-plane health check"))
}

// If KCP should manage etcd, ensure etcd is healthy.
if controlPlane.IsEtcdManaged() {
checkResult, err := workloadCluster.EtcdIsHealthy(ctx)
if err != nil {
errList = append(errList, errors.Wrap(err, "failed to pass etcd health check"))
} else if err := checkResult.Aggregate(controlPlane); err != nil {
if err := workloadCluster.EtcdIsHealthy(ctx, controlPlane.Machines.UnsortedList()); err != nil {
errList = append(errList, errors.Wrap(err, "failed to pass etcd health check"))
r.recorder.Eventf(kcp, corev1.EventTypeWarning, "ControlPlaneUnhealthy",
"Waiting for control plane to pass etcd health check to continue reconciliation: %v", err)

// If there are any etcd members that do not have corresponding nodes, remove them from etcd and from the kubeadm configmap.
// This will solve issues related to manual control-plane machine deletion.
workloadCluster, err := r.managementCluster.GetWorkloadCluster(ctx, util.ObjectKey(cluster))
if err != nil {
errList = append(errList, errors.Wrap(err, "cannot get remote client to workload cluster"))
} else if err := workloadCluster.ReconcileEtcdMembers(ctx); err != nil {
if err := workloadCluster.ReconcileEtcdMembers(ctx); err != nil {
errList = append(errList, errors.Wrap(err, "failed attempt to remove potential hanging etcd members to pass etcd health check to continue reconciliation"))
}
}
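The two hunks above capture the core of the change: the health check functions now return a single error instead of a (HealthCheckResult, error) pair, so callers no longer distinguish between failing to run a check and a check that fails. A before/after sketch distilled from the diff (not additional code in the PR):

```go
// Before: two failure modes, handled separately by every caller.
checkResult, err := workloadCluster.ControlPlaneIsHealthy(ctx)
if err != nil {
	// could not run the check (e.g. cannot reach the workload cluster)
} else if err := checkResult.Aggregate(controlPlane); err != nil {
	// the check ran but a node failed it, or nodes and machines did not match
}

// After: one error covers both failure modes.
if err := workloadCluster.ControlPlaneIsHealthy(ctx, controlPlane.Machines.UnsortedList()); err != nil {
	// handled consistently: wrap the error, record an event, requeue
}
```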
12 changes: 6 additions & 6 deletions controlplane/kubeadm/controllers/fakes_test.go
@@ -98,18 +98,18 @@ func (f fakeWorkloadCluster) UpdateKubeletConfigMap(ctx context.Context, version
return nil
}

func (f fakeWorkloadCluster) ControlPlaneIsHealthy(ctx context.Context) (internal.HealthCheckResult, error) {
func (f fakeWorkloadCluster) ControlPlaneIsHealthy(ctx context.Context, machines []*clusterv1.Machine) error {
if !f.ControlPlaneHealthy {
return nil, errors.New("control plane is not healthy")
return errors.New("control plane is not healthy")
}
return nil, nil
return nil
}

func (f fakeWorkloadCluster) EtcdIsHealthy(ctx context.Context) (internal.HealthCheckResult, error) {
func (f fakeWorkloadCluster) EtcdIsHealthy(ctx context.Context, machines []*clusterv1.Machine) error {
if !f.EtcdHealthy {
return nil, errors.New("etcd is not healthy")
return errors.New("etcd is not healthy")
}
return nil, nil
return nil
}
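The fake above is driven by its ControlPlaneHealthy and EtcdHealthy fields (visible in the method bodies). A hypothetical test exercising it under the new signatures, with all surrounding wiring assumed, might look like:

```go
// Illustrative only; not part of this PR. Assumes the imports already used by
// the controller tests (context, clusterv1, gomega).
func TestFakeWorkloadClusterHealthChecks(t *testing.T) {
	g := NewWithT(t)

	fakeWC := fakeWorkloadCluster{ControlPlaneHealthy: true, EtcdHealthy: false}
	machines := []*clusterv1.Machine{} // the fake ignores its machines argument

	g.Expect(fakeWC.ControlPlaneIsHealthy(context.Background(), machines)).To(Succeed())
	g.Expect(fakeWC.EtcdIsHealthy(context.Background(), machines)).To(HaveOccurred())
}
```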

type fakeMigrator struct {
9 changes: 6 additions & 3 deletions controlplane/kubeadm/internal/cluster_test.go
@@ -96,6 +96,11 @@ func TestCheckStaticPodNotReadyCondition(t *testing.T) {
func TestControlPlaneIsHealthy(t *testing.T) {
g := NewWithT(t)

machines := []*clusterv1.Machine{
controlPlaneMachine("first-control-plane"),
controlPlaneMachine("second-control-plane"),
controlPlaneMachine("third-control-plane"),
}
readyStatus := corev1.PodStatus{
Conditions: []corev1.PodCondition{
{
@@ -118,10 +123,8 @@
},
}

health, err := workloadCluster.ControlPlaneIsHealthy(context.Background())
err := workloadCluster.ControlPlaneIsHealthy(context.Background(), machines)
g.Expect(err).NotTo(HaveOccurred())
g.Expect(health).NotTo(HaveLen(0))
g.Expect(health).To(HaveLen(len(nodeListForTestControlPlaneIsHealthy().Items)))
}
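Both updated tests rely on a controlPlaneMachine helper that is defined elsewhere in the package and is not part of this diff. A plausible minimal sketch (the metav1/corev1 imports and the namespace are assumptions; the nodeRef requirement comes from checkResult.CompareWith in the workload_cluster.go hunks below):

```go
// Hypothetical sketch of the helper referenced above; not shown in this diff.
// The nodeRef must point at the node the health check inspects, otherwise
// CompareWith reports the machine's node as "not checked".
func controlPlaneMachine(name string) *clusterv1.Machine {
	return &clusterv1.Machine{
		ObjectMeta: metav1.ObjectMeta{
			Namespace: "default", // assumed
			Name:      name,
		},
		Status: clusterv1.MachineStatus{
			NodeRef: &corev1.ObjectReference{Name: name},
		},
	}
}
```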

func TestGetMachinesForCluster(t *testing.T) {
33 changes: 16 additions & 17 deletions controlplane/kubeadm/internal/workload_cluster.go
@@ -58,8 +58,8 @@ var (
type WorkloadCluster interface {
// Basic health and status checks.
ClusterStatus(ctx context.Context) (ClusterStatus, error)
ControlPlaneIsHealthy(ctx context.Context) (HealthCheckResult, error)
EtcdIsHealthy(ctx context.Context) (HealthCheckResult, error)
ControlPlaneIsHealthy(ctx context.Context, machines []*clusterv1.Machine) error
EtcdIsHealthy(ctx context.Context, machines []*clusterv1.Machine) error

// Upgrade related tasks.
ReconcileKubeletRBACBinding(ctx context.Context, version semver.Version) error
@@ -107,17 +107,16 @@ func (w *Workload) getConfigMap(ctx context.Context, configMap ctrlclient.Object
return original.DeepCopy(), nil
}

// HealthCheckResult maps nodes that are checked to any errors the node has related to the check.
type HealthCheckResult map[string]error
// checkResult maps nodes that are checked to any errors the node has related to the check.
type checkResult map[string]error

// Aggregate will analyse HealthCheckResult and report any errors discovered.
// It also ensures there is a 1;1 match between nodes and machines.
func (h HealthCheckResult) Aggregate(controlPlane *ControlPlane) error {
// CompareWith compares the current health check result with the list of machines coming from the control plane.
// Additionally, it checks that for each machine a node is present.
func (c checkResult) CompareWith(machines []*clusterv1.Machine) error {
var errorList []error
kcpMachines := controlPlane.Machines.UnsortedList()
// Make sure Cluster API is aware of all the nodes.

for nodeName, err := range h {
for nodeName, err := range c {
if err != nil {
errorList = append(errorList, fmt.Errorf("node %q: %v", nodeName, err))
}
@@ -128,32 +127,32 @@ func (h HealthCheckResult) Aggregate(controlPlane *ControlPlane) error {

// This check ensures there is a 1 to 1 correspondence of nodes and machines.
// If a machine was not checked this is considered an error.
for _, machine := range kcpMachines {
for _, machine := range machines {
if machine.Status.NodeRef == nil {
// The condition for this case is set by the Machine controller
return errors.Errorf("control plane machine %s/%s has no status.nodeRef", machine.Namespace, machine.Name)
}
if _, ok := h[machine.Status.NodeRef.Name]; !ok {
if _, ok := c[machine.Status.NodeRef.Name]; !ok {
return errors.Errorf("machine's (%s/%s) node (%s) was not checked", machine.Namespace, machine.Name, machine.Status.NodeRef.Name)
}
}
if len(h) != len(kcpMachines) {
if len(c) != len(machines) {
// MachinesReadyCondition covers this health failure.
return errors.Errorf("number of nodes and machines in namespace %s did not match: %d nodes %d machines", controlPlane.Cluster.Namespace, len(h), len(kcpMachines))
return errors.Errorf("number of nodes and machines did not match: %d nodes %d machines", len(c), len(machines))
}
return nil
}
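To make the semantics of the new checkResult type concrete, here is a hypothetical test exercising CompareWith directly. machineWithNodeRef stands in for a helper that returns a *clusterv1.Machine whose status.nodeRef points at the given node; it, the gomega helpers, and the imports are assumptions rather than code from this PR, and the second assertion assumes the collapsed portion of CompareWith returns an aggregate of errorList when it is non-empty, as the visible code implies:

```go
// Illustrative only: exercises the outcomes visible in CompareWith above.
func TestCheckResultCompareWith(t *testing.T) {
	g := NewWithT(t)

	healthy := checkResult{"node-1": nil, "node-2": nil}
	machines := []*clusterv1.Machine{
		machineWithNodeRef("machine-1", "node-1"), // hypothetical helper
		machineWithNodeRef("machine-2", "node-2"),
	}
	// Every node healthy, every machine backed by a checked node: no error.
	g.Expect(healthy.CompareWith(machines)).To(Succeed())

	// A per-node check failure is surfaced as an error.
	unhealthy := checkResult{"node-1": nil, "node-2": errors.New("static pod is not ready")}
	g.Expect(unhealthy.CompareWith(machines)).ToNot(Succeed())

	// A machine whose node was never checked breaks the 1:1 node/machine correspondence.
	extra := append(machines, machineWithNodeRef("machine-3", "node-3"))
	g.Expect(healthy.CompareWith(extra)).ToNot(Succeed())
}
```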

// ControlPlaneIsHealthy does a best effort check of the control plane components the kubeadm control plane cares about.
// Each control plane node is checked and any error encountered is recorded for that node.
// The per-node results are then compared with the given machines via checkResult.CompareWith.
func (w *Workload) ControlPlaneIsHealthy(ctx context.Context) (HealthCheckResult, error) {
func (w *Workload) ControlPlaneIsHealthy(ctx context.Context, machines []*clusterv1.Machine) error {
controlPlaneNodes, err := w.getControlPlaneNodes(ctx)
if err != nil {
return nil, err
return err
}

response := make(map[string]error)
response := make(checkResult)
for _, node := range controlPlaneNodes.Items {
name := node.Name
response[name] = nil
@@ -186,7 +185,7 @@ func (w *Workload) ControlPlaneIsHealthy(ctx context.Context) (HealthCheckResult
response[name] = checkStaticPodReadyCondition(controllerManagerPod)
}

return response, nil
return response.CompareWith(machines)
}

// UpdateKubernetesVersionInKubeadmConfigMap updates the kubernetes version in the kubeadm config map.
10 changes: 5 additions & 5 deletions controlplane/kubeadm/internal/workload_cluster_etcd.go
@@ -38,17 +38,17 @@ type etcdClientFor interface {
// This is a best effort check and nodes can become unhealthy after the check is complete. It is not a guarantee.
// It's used as a signal for whether we should allow a target cluster to scale up, scale down or upgrade.
// The per-node results are compared with the given machines via checkResult.CompareWith.
func (w *Workload) EtcdIsHealthy(ctx context.Context) (HealthCheckResult, error) {
func (w *Workload) EtcdIsHealthy(ctx context.Context, machines []*clusterv1.Machine) error {
var knownClusterID uint64
var knownMemberIDSet etcdutil.UInt64Set

controlPlaneNodes, err := w.getControlPlaneNodes(ctx)
if err != nil {
return nil, err
return err
}

expectedMembers := 0
response := make(map[string]error)
response := make(checkResult)
for _, node := range controlPlaneNodes.Items {
name := node.Name
response[name] = nil
@@ -127,10 +127,10 @@ func (w *Workload) EtcdIsHealthy(ctx context.Context) (HealthCheckResult, error)
// Check that there is exactly one etcd member for every healthy pod.
// This allows us to handle the expected case where there is a failing pod but it's been removed from the member list.
if expectedMembers != len(knownMemberIDSet) {
return response, errors.Errorf("there are %d healthy etcd pods, but %d etcd members", expectedMembers, len(knownMemberIDSet))
return errors.Errorf("there are %d healthy etcd pods, but %d etcd members", expectedMembers, len(knownMemberIDSet))
}

return response, nil
return response.CompareWith(machines)
}

// ReconcileEtcdMembers iterates over all etcd members and finds members that do not have corresponding nodes.
12 changes: 7 additions & 5 deletions controlplane/kubeadm/internal/workload_cluster_etcd_test.go
@@ -39,6 +39,12 @@ import (
func TestWorkload_EtcdIsHealthy(t *testing.T) {
g := NewWithT(t)

machines := []*clusterv1.Machine{
controlPlaneMachine("test-1"),
controlPlaneMachine("test-2"),
controlPlaneMachine("test-3"),
controlPlaneMachine("test-4"),
}
workload := &Workload{
Client: &fakeClient{
get: map[string]interface{}{
@@ -75,12 +81,8 @@ func TestWorkload_EtcdIsHealthy(t *testing.T) {
},
}
ctx := context.Background()
health, err := workload.EtcdIsHealthy(ctx)
err := workload.EtcdIsHealthy(ctx, machines)
g.Expect(err).NotTo(HaveOccurred())

for _, err := range health {
g.Expect(err).NotTo(HaveOccurred())
}
}

func TestUpdateEtcdVersionInKubeadmConfigMap(t *testing.T) {