From 0d07944fd3d23ebe476d248b1a594c445273b492 Mon Sep 17 00:00:00 2001 From: Sedef Date: Thu, 15 Oct 2020 15:04:45 -0700 Subject: [PATCH] Refactor controlplane health check in KCP --- .../kubeadm/controllers/controller.go | 105 ++++++++--- .../kubeadm/controllers/controller_test.go | 4 +- .../kubeadm/controllers/fakes_test.go | 17 +- controlplane/kubeadm/controllers/scale.go | 12 +- .../kubeadm/controllers/scale_test.go | 15 +- .../kubeadm/controllers/upgrade_test.go | 8 +- controlplane/kubeadm/internal/cluster.go | 67 ++++--- controlplane/kubeadm/internal/cluster_test.go | 163 +++++++----------- .../kubeadm/internal/machine_collection.go | 8 +- .../kubeadm/internal/workload_cluster.go | 14 +- .../kubeadm/internal/workload_cluster_etcd.go | 2 +- .../internal/workload_cluster_etcd_test.go | 8 +- 12 files changed, 241 insertions(+), 182 deletions(-) diff --git a/controlplane/kubeadm/controllers/controller.go b/controlplane/kubeadm/controllers/controller.go index d6a10904cd75..5f8268c96c54 100644 --- a/controlplane/kubeadm/controllers/controller.go +++ b/controlplane/kubeadm/controllers/controller.go @@ -31,13 +31,6 @@ import ( kerrors "k8s.io/apimachinery/pkg/util/errors" "k8s.io/client-go/tools/record" "k8s.io/utils/pointer" - ctrl "sigs.k8s.io/controller-runtime" - "sigs.k8s.io/controller-runtime/pkg/client" - "sigs.k8s.io/controller-runtime/pkg/controller" - "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" - "sigs.k8s.io/controller-runtime/pkg/handler" - "sigs.k8s.io/controller-runtime/pkg/source" - clusterv1 "sigs.k8s.io/cluster-api/api/v1alpha3" bootstrapv1 "sigs.k8s.io/cluster-api/bootstrap/kubeadm/api/v1alpha3" kubeadmv1 "sigs.k8s.io/cluster-api/bootstrap/kubeadm/types/v1beta1" @@ -51,6 +44,12 @@ import ( "sigs.k8s.io/cluster-api/util/patch" "sigs.k8s.io/cluster-api/util/predicates" "sigs.k8s.io/cluster-api/util/secret" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/controller" + "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" + "sigs.k8s.io/controller-runtime/pkg/handler" + "sigs.k8s.io/controller-runtime/pkg/source" ) // +kubebuilder:rbac:groups=core,resources=events,verbs=get;list;watch;create;patch @@ -289,21 +288,25 @@ func (r *KubeadmControlPlaneReconciler) reconcile(ctx context.Context, cluster * return ctrl.Result{}, err } - ownedMachines := controlPlaneMachines.Filter(machinefilters.OwnedMachines(kcp)) - if len(ownedMachines) != len(controlPlaneMachines) { - logger.Info("Not all control plane machines are owned by this KubeadmControlPlane, refusing to operate in mixed management mode") - return ctrl.Result{}, nil - } - - controlPlane, err := internal.NewControlPlane(ctx, r.Client, cluster, kcp, ownedMachines) + controlPlane, err := r.createControlPlane(ctx, cluster, kcp) if err != nil { logger.Error(err, "failed to initialize control plane") return ctrl.Result{}, err } + if len(controlPlane.Machines) != len(controlPlaneMachines) { + logger.Info("Not all control plane machines are owned by this KubeadmControlPlane, refusing to operate in mixed management mode") + return ctrl.Result{}, nil + } // Aggregate the operational state of all the machines; while aggregating we are adding the // source ref (reason@machine/name) so the problem can be easily tracked down to its source machine. 
- conditions.SetAggregate(controlPlane.KCP, controlplanev1.MachinesReadyCondition, ownedMachines.ConditionGetters(), conditions.AddSourceRef()) + conditions.SetAggregate(controlPlane.KCP, controlplanev1.MachinesReadyCondition, controlPlane.Machines.ConditionGetters(), conditions.AddSourceRef()) + + // reconcileControlPlaneHealth returns an error if there is a machine being deleted or the control plane is unhealthy. + // If the control plane is not initialized, then the control-plane machines will be empty and hence the health check will not fail. + if result, err := r.reconcileControlPlaneHealth(ctx, cluster, kcp, controlPlane); err != nil || !result.IsZero() { + return result, err + } // Control plane machine rollout due to configuration changes (e.g. upgrades) takes precedence over other operations. needRollout := controlPlane.MachinesNeedingRollout() @@ -324,7 +327,7 @@ func (r *KubeadmControlPlaneReconciler) reconcile(ctx context.Context, cluster * } // If we've made it this far, we can assume that all ownedMachines are up to date - numMachines := len(ownedMachines) + numMachines := len(controlPlane.Machines) desiredReplicas := int(*kcp.Spec.Replicas) switch { @@ -372,6 +375,24 @@ func (r *KubeadmControlPlaneReconciler) reconcile(ctx context.Context, cluster * return ctrl.Result{}, nil } +func (r *KubeadmControlPlaneReconciler) createControlPlane(ctx context.Context, cluster *clusterv1.Cluster, kcp *controlplanev1.KubeadmControlPlane) (*internal.ControlPlane, error) { + logger := r.Log.WithValues("namespace", kcp.Namespace, "kubeadmControlPlane", kcp.Name, "cluster", cluster.Name) + + controlPlaneMachines, err := r.managementCluster.GetMachinesForCluster(ctx, util.ObjectKey(cluster), machinefilters.ControlPlaneMachines(cluster.Name)) + if err != nil { + logger.Error(err, "failed to retrieve control plane machines for cluster") + return nil, err + } + ownedMachines := controlPlaneMachines.Filter(machinefilters.OwnedMachines(kcp)) + + controlPlane, err := internal.NewControlPlane(ctx, r.Client, cluster, kcp, ownedMachines) + if err != nil { + logger.Error(err, "failed to initialize control plane") + return nil, err + } + return controlPlane, nil +} + // reconcileDelete handles KubeadmControlPlane deletion. // The implementation does not take non-control plane workloads into consideration. This may or may not change in the future. // Please see https://github.com/kubernetes-sigs/cluster-api/issues/2064. @@ -379,6 +400,26 @@ func (r *KubeadmControlPlaneReconciler) reconcileDelete(ctx context.Context, clu logger := r.Log.WithValues("namespace", kcp.Namespace, "kubeadmControlPlane", kcp.Name, "cluster", cluster.Name) logger.Info("Reconcile KubeadmControlPlane deletion") + controlPlane, err := r.createControlPlane(ctx, cluster, kcp) + if err != nil { + logger.Error(err, "failed to initialize control plane") + return ctrl.Result{}, err + } + + // Ignore the health check results and errors here; the health check functions are only called to set health-related conditions on Machines. + // Errors may be due to not being able to get workload cluster nodes.
+ _, err = r.managementCluster.TargetClusterControlPlaneHealthCheck(ctx, controlPlane, util.ObjectKey(cluster)) + if err != nil { + // Do nothing + r.Log.Info("Control plane did not pass control plane health check during delete reconciliation", "err", err.Error()) + } + _, err = r.managementCluster.TargetClusterEtcdHealthCheck(ctx, controlPlane, util.ObjectKey(cluster)) + if err != nil { + // Do nothing + r.Log.Info("Control plane did not pass etcd health check during delete reconciliation", "err", err.Error()) + } + + // Gets all machines, not just control plane machines. allMachines, err := r.managementCluster.GetMachinesForCluster(ctx, util.ObjectKey(cluster)) if err != nil { return ctrl.Result{}, err } @@ -442,21 +483,43 @@ func (r *KubeadmControlPlaneReconciler) ClusterToKubeadmControlPlane(o handler.M return nil } -// reconcileHealth performs health checks for control plane components and etcd +// reconcileControlPlaneHealth performs health checks for control plane components and etcd // It removes any etcd members that do not have a corresponding node. // Also, as a final step, checks whether any machine is being deleted. -func (r *KubeadmControlPlaneReconciler) reconcileHealth(ctx context.Context, cluster *clusterv1.Cluster, kcp *controlplanev1.KubeadmControlPlane, controlPlane *internal.ControlPlane) (ctrl.Result, error) { +func (r *KubeadmControlPlaneReconciler) reconcileControlPlaneHealth(ctx context.Context, cluster *clusterv1.Cluster, kcp *controlplanev1.KubeadmControlPlane, controlPlane *internal.ControlPlane) (ctrl.Result, error) { + logger := r.Log.WithValues("namespace", kcp.Namespace, "kubeadmControlPlane", kcp.Name) + + // If there are no KCP-owned control-plane machines, then the control plane has not been initialized yet. + if controlPlane.Machines.Len() == 0 { + return ctrl.Result{}, nil + } + + for i := range controlPlane.Machines { + m := controlPlane.Machines[i] + // Initialize the patch helper. + patchHelper, err := patch.NewHelper(m, r.Client) + if err != nil { + return ctrl.Result{}, err + } + + defer func() { + // Always attempt to Patch the Machine conditions after each health reconciliation. + if err := patchHelper.Patch(ctx, m); err != nil { + logger.Error(err, "Failed to patch KubeadmControlPlane Machine", "machine", m.Name) + } + }() + } // Do a health check of the Control Plane components - if err := r.managementCluster.TargetClusterControlPlaneIsHealthy(ctx, util.ObjectKey(cluster)); err != nil { + if err := r.managementCluster.TargetClusterAnalyseControlPlaneHealth(ctx, controlPlane, util.ObjectKey(cluster)); err != nil { r.recorder.Eventf(kcp, corev1.EventTypeWarning, "ControlPlaneUnhealthy", "Waiting for control plane to pass control plane health check to continue reconciliation: %v", err) - return ctrl.Result{RequeueAfter: healthCheckFailedRequeueAfter}, nil + return ctrl.Result{}, errors.Wrap(err, "failed to pass control-plane health check") } // If KCP should manage etcd, ensure etcd is healthy.
if controlPlane.IsEtcdManaged() { - if err := r.managementCluster.TargetClusterEtcdIsHealthy(ctx, util.ObjectKey(cluster)); err != nil { + if err := r.managementCluster.TargetClusterAnalyseEtcdHealth(ctx, controlPlane, util.ObjectKey(cluster)); err != nil { errList := []error{errors.Wrap(err, "failed to pass etcd health check")} r.recorder.Eventf(kcp, corev1.EventTypeWarning, "ControlPlaneUnhealthy", "Waiting for control plane to pass etcd health check to continue reconciliation: %v", err) diff --git a/controlplane/kubeadm/controllers/controller_test.go b/controlplane/kubeadm/controllers/controller_test.go index 01dd7b2bf6eb..34c21a0a0a9c 100644 --- a/controlplane/kubeadm/controllers/controller_test.go +++ b/controlplane/kubeadm/controllers/controller_test.go @@ -572,7 +572,7 @@ func TestKubeadmControlPlaneReconciler_adoption(t *testing.T) { g := NewWithT(t) cluster, kcp, tmpl := createClusterWithControlPlane() - cluster.Spec.ControlPlaneEndpoint.Host = "nodomain.example.com" + cluster.Spec.ControlPlaneEndpoint.Host = "nodomain.example.com1" cluster.Spec.ControlPlaneEndpoint.Port = 6443 kcp.Spec.Version = version @@ -642,7 +642,7 @@ func TestKubeadmControlPlaneReconciler_adoption(t *testing.T) { g := NewWithT(t) cluster, kcp, tmpl := createClusterWithControlPlane() - cluster.Spec.ControlPlaneEndpoint.Host = "nodomain.example.com" + cluster.Spec.ControlPlaneEndpoint.Host = "nodomain.example.com2" cluster.Spec.ControlPlaneEndpoint.Port = 6443 kcp.Spec.Version = "v1.17.0" diff --git a/controlplane/kubeadm/controllers/fakes_test.go b/controlplane/kubeadm/controllers/fakes_test.go index 46af76804696..7866b8a2c8c8 100644 --- a/controlplane/kubeadm/controllers/fakes_test.go +++ b/controlplane/kubeadm/controllers/fakes_test.go @@ -57,19 +57,32 @@ func (f *fakeManagementCluster) GetMachinesForCluster(c context.Context, n clien return f.Machines, nil } -func (f *fakeManagementCluster) TargetClusterControlPlaneIsHealthy(_ context.Context, _ client.ObjectKey) error { +func (f *fakeManagementCluster) TargetClusterAnalyseControlPlaneHealth(_ context.Context, _ *internal.ControlPlane, _ client.ObjectKey) error { if !f.ControlPlaneHealthy { return errors.New("control plane is not healthy") } return nil } -func (f *fakeManagementCluster) TargetClusterEtcdIsHealthy(_ context.Context, _ client.ObjectKey) error { +func (f *fakeManagementCluster) TargetClusterAnalyseEtcdHealth(_ context.Context, _ *internal.ControlPlane, _ client.ObjectKey) error { if !f.EtcdHealthy { return errors.New("etcd is not healthy") } return nil } +func (f *fakeManagementCluster) TargetClusterEtcdHealthCheck(_ context.Context, _ *internal.ControlPlane, _ client.ObjectKey) (internal.HealthCheckResult, error) { + if !f.EtcdHealthy { + return nil, errors.New("etcd is not healthy") + } + return nil, nil +} + +func (f *fakeManagementCluster) TargetClusterControlPlaneHealthCheck(_ context.Context, _ *internal.ControlPlane, _ client.ObjectKey) (internal.HealthCheckResult, error) { + if !f.ControlPlaneHealthy { + return nil, errors.New("control plane is not healthy") + } + return nil, nil +} type fakeWorkloadCluster struct { *internal.Workload diff --git a/controlplane/kubeadm/controllers/scale.go b/controlplane/kubeadm/controllers/scale.go index a89d5b8e5322..2591ec28c47e 100644 --- a/controlplane/kubeadm/controllers/scale.go +++ b/controlplane/kubeadm/controllers/scale.go @@ -63,11 +63,6 @@ func (r *KubeadmControlPlaneReconciler) initializeControlPlane(ctx context.Conte func (r *KubeadmControlPlaneReconciler) scaleUpControlPlane(ctx 
context.Context, cluster *clusterv1.Cluster, kcp *controlplanev1.KubeadmControlPlane, controlPlane *internal.ControlPlane) (ctrl.Result, error) { logger := controlPlane.Logger() - // reconcileHealth returns err if there is a machine being delete which is a required condition to check before scaling up - if result, err := r.reconcileHealth(ctx, cluster, kcp, controlPlane); err != nil || !result.IsZero() { - return result, err - } - // Create the bootstrap configuration bootstrapSpec := controlPlane.JoinControlPlaneConfig() fd := controlPlane.NextFailureDomainForScaleUp() @@ -90,10 +85,6 @@ func (r *KubeadmControlPlaneReconciler) scaleDownControlPlane( ) (ctrl.Result, error) { logger := controlPlane.Logger() - if result, err := r.reconcileHealth(ctx, cluster, kcp, controlPlane); err != nil || !result.IsZero() { - return result, err - } - workloadCluster, err := r.managementCluster.GetWorkloadCluster(ctx, util.ObjectKey(cluster)) if err != nil { logger.Error(err, "Failed to create client to workload cluster") @@ -123,7 +114,8 @@ func (r *KubeadmControlPlaneReconciler) scaleDownControlPlane( } } - if err := r.managementCluster.TargetClusterControlPlaneIsHealthy(ctx, util.ObjectKey(cluster)); err != nil { + // TODO: check if this is needed after moving the health check to the main reconcile + if err := r.managementCluster.TargetClusterAnalyseControlPlaneHealth(ctx, controlPlane, util.ObjectKey(cluster)); err != nil { logger.V(2).Info("Waiting for control plane to pass control plane health check before removing a control plane machine", "cause", err) r.recorder.Eventf(kcp, corev1.EventTypeWarning, "ControlPlaneUnhealthy", "Waiting for control plane to pass control plane health check before removing a control plane machine: %v", err) diff --git a/controlplane/kubeadm/controllers/scale_test.go b/controlplane/kubeadm/controllers/scale_test.go index ceed678aec27..c971906b90da 100644 --- a/controlplane/kubeadm/controllers/scale_test.go +++ b/controlplane/kubeadm/controllers/scale_test.go @@ -124,6 +124,8 @@ func TestKubeadmControlPlaneReconciler_scaleUpControlPlane(t *testing.T) { }) t.Run("does not create a control plane Machine if health checks fail", func(t *testing.T) { cluster, kcp, genericMachineTemplate := createClusterWithControlPlane() + cluster.Spec.ControlPlaneEndpoint.Host = "nodomain.example.com" + cluster.Spec.ControlPlaneEndpoint.Port = 6443 initObjs := []runtime.Object{cluster.DeepCopy(), kcp.DeepCopy(), genericMachineTemplate.DeepCopy()} beforeMachines := internal.NewFilterableMachineCollection() @@ -170,18 +172,11 @@ func TestKubeadmControlPlaneReconciler_scaleUpControlPlane(t *testing.T) { Log: log.Log, recorder: record.NewFakeRecorder(32), } - controlPlane := &internal.ControlPlane{ - KCP: kcp, - Cluster: cluster, - Machines: beforeMachines, - } - result, err := r.scaleUpControlPlane(context.Background(), cluster.DeepCopy(), kcp.DeepCopy(), controlPlane) - if tc.expectErr { - g.Expect(err).To(HaveOccurred()) - } - g.Expect(result).To(Equal(tc.expectResult)) + _, err := r.reconcile(context.Background(), cluster, kcp) + g.Expect(err).To(HaveOccurred()) + // scaleUpControlPlane is never called due to health check failure and new machine is not created to scale up. 
controlPlaneMachines := &clusterv1.MachineList{} g.Expect(fakeClient.List(context.Background(), controlPlaneMachines)).To(Succeed()) g.Expect(controlPlaneMachines.Items).To(HaveLen(len(beforeMachines))) diff --git a/controlplane/kubeadm/controllers/upgrade_test.go b/controlplane/kubeadm/controllers/upgrade_test.go index 53596c317188..f0702e2a3da2 100644 --- a/controlplane/kubeadm/controllers/upgrade_test.go +++ b/controlplane/kubeadm/controllers/upgrade_test.go @@ -36,6 +36,8 @@ func TestKubeadmControlPlaneReconciler_upgradeControlPlane(t *testing.T) { g := NewWithT(t) cluster, kcp, genericMachineTemplate := createClusterWithControlPlane() + cluster.Spec.ControlPlaneEndpoint.Host = "nodomain.example.com" + cluster.Spec.ControlPlaneEndpoint.Port = 6443 kcp.Spec.Version = "v1.17.3" kcp.Spec.KubeadmConfigSpec.ClusterConfiguration = nil kcp.Spec.Replicas = pointer.Int32Ptr(1) @@ -89,9 +91,9 @@ func TestKubeadmControlPlaneReconciler_upgradeControlPlane(t *testing.T) { // run upgrade a second time, simulate that the node has not appeared yet but the machine exists r.managementCluster.(*fakeManagementCluster).ControlPlaneHealthy = false - result, err = r.upgradeControlPlane(context.Background(), cluster, kcp, controlPlane, needingUpgrade) - g.Expect(result).To(Equal(ctrl.Result{RequeueAfter: healthCheckFailedRequeueAfter})) - g.Expect(err).To(BeNil()) + // Unhealthy control plane will be detected during reconcile loop and upgrade will never be called. + _, err = r.reconcile(context.Background(), cluster, kcp) + g.Expect(err).To(HaveOccurred()) g.Expect(fakeClient.List(context.Background(), bothMachines, client.InNamespace(cluster.Namespace))).To(Succeed()) g.Expect(bothMachines.Items).To(HaveLen(2)) diff --git a/controlplane/kubeadm/internal/cluster.go b/controlplane/kubeadm/internal/cluster.go index 0789c27e88f1..9ec096f8685d 100644 --- a/controlplane/kubeadm/internal/cluster.go +++ b/controlplane/kubeadm/internal/cluster.go @@ -28,13 +28,12 @@ import ( "k8s.io/apimachinery/pkg/runtime" kerrors "k8s.io/apimachinery/pkg/util/errors" "k8s.io/client-go/kubernetes/scheme" - "sigs.k8s.io/controller-runtime/pkg/client" - ctrlclient "sigs.k8s.io/controller-runtime/pkg/client" - clusterv1 "sigs.k8s.io/cluster-api/api/v1alpha3" "sigs.k8s.io/cluster-api/controllers/remote" "sigs.k8s.io/cluster-api/controlplane/kubeadm/internal/machinefilters" "sigs.k8s.io/cluster-api/util/secret" + "sigs.k8s.io/controller-runtime/pkg/client" + ctrlclient "sigs.k8s.io/controller-runtime/pkg/client" ) // ManagementCluster defines all behaviors necessary for something to function as a management cluster. 
@@ -42,8 +41,10 @@ type ManagementCluster interface { ctrlclient.Reader GetMachinesForCluster(ctx context.Context, cluster client.ObjectKey, filters ...machinefilters.Func) (FilterableMachineCollection, error) - TargetClusterEtcdIsHealthy(ctx context.Context, clusterKey client.ObjectKey) error - TargetClusterControlPlaneIsHealthy(ctx context.Context, clusterKey client.ObjectKey) error + TargetClusterControlPlaneHealthCheck(ctx context.Context, controlPlane *ControlPlane, clusterKey client.ObjectKey) (HealthCheckResult, error) + TargetClusterEtcdHealthCheck(ctx context.Context, controlPlane *ControlPlane, clusterKey client.ObjectKey) (HealthCheckResult, error) + TargetClusterAnalyseEtcdHealth(ctx context.Context, controlPlane *ControlPlane, clusterKey client.ObjectKey) error + TargetClusterAnalyseControlPlaneHealth(ctx context.Context, controlPlane *ControlPlane, clusterKey client.ObjectKey) error GetWorkloadCluster(ctx context.Context, clusterKey client.ObjectKey) (WorkloadCluster, error) } @@ -180,16 +181,15 @@ func (m *Management) getApiServerEtcdClientCert(ctx context.Context, clusterKey return tls.X509KeyPair(crtData, keyData) } -type healthCheck func(context.Context) (HealthCheckResult, error) - // HealthCheck will run a generic health check function and report any errors discovered. // In addition to the health check, it also ensures there is a 1:1 match between nodes and machines. -func (m *Management) healthCheck(ctx context.Context, check healthCheck, clusterKey client.ObjectKey) error { +// To have access to the owned control-plane machines during health checks, the ControlPlane is passed in here. +func (m *Management) healthCheck(controlPlane *ControlPlane, nodeChecks HealthCheckResult, clusterKey client.ObjectKey) error { var errorList []error - nodeChecks, err := check(ctx) - if err != nil { - errorList = append(errorList, err) - } + kcpMachines := controlPlane.Machines.UnsortedList() + // Make sure Cluster API is aware of all the nodes. + + // TODO: If any node has a failure, healthCheck fails. This may be too strict for the health check of HA clusters (Missing a single etcd pod does not indicate a problem)... for nodeName, err := range nodeChecks { if err != nil { errorList = append(errorList, fmt.Errorf("node %q: %v", nodeName, err)) @@ -199,44 +199,57 @@ func (m *Management) healthCheck(ctx context.Context, check healthCheck, cluster return kerrors.NewAggregate(errorList) } - // Make sure Cluster API is aware of all the nodes. - machines, err := m.GetMachinesForCluster(ctx, clusterKey, machinefilters.ControlPlaneMachines(clusterKey.Name)) - if err != nil { - return err - } - // This check ensures there is a 1 to 1 correspondence of nodes and machines. // If a machine was not checked this is considered an error.
- for _, machine := range machines { + for _, machine := range kcpMachines { if machine.Status.NodeRef == nil { + // The condition for this case is set by the Machine controller return errors.Errorf("control plane machine %s/%s has no status.nodeRef", machine.Namespace, machine.Name) } if _, ok := nodeChecks[machine.Status.NodeRef.Name]; !ok { return errors.Errorf("machine's (%s/%s) node (%s) was not checked", machine.Namespace, machine.Name, machine.Status.NodeRef.Name) } } - if len(nodeChecks) != len(machines) { - return errors.Errorf("number of nodes and machines in namespace %s did not match: %d nodes %d machines", clusterKey.Namespace, len(nodeChecks), len(machines)) + if len(nodeChecks) != len(kcpMachines) { + // MachinesReadyCondition covers this health failure. + return errors.Errorf("number of nodes and machines in namespace %s did not match: %d nodes %d machines", clusterKey.Namespace, len(nodeChecks), len(kcpMachines)) } return nil } +func (m *Management) TargetClusterControlPlaneHealthCheck(ctx context.Context, controlPlane *ControlPlane, clusterKey client.ObjectKey) (HealthCheckResult, error) { + workloadCluster, err := m.GetWorkloadCluster(ctx, clusterKey) + if err != nil { + return nil, err + } + return workloadCluster.ControlPlaneIsHealthy(ctx, controlPlane) +} + // TargetClusterControlPlaneIsHealthy checks every node for control plane health. -func (m *Management) TargetClusterControlPlaneIsHealthy(ctx context.Context, clusterKey client.ObjectKey) error { +func (m *Management) TargetClusterAnalyseControlPlaneHealth(ctx context.Context, controlPlane *ControlPlane, clusterKey client.ObjectKey) error { // TODO: add checks for expected taints/labels - cluster, err := m.GetWorkloadCluster(ctx, clusterKey) + + checkResult, err := m.TargetClusterControlPlaneHealthCheck(ctx, controlPlane, clusterKey) if err != nil { return err } - return m.healthCheck(ctx, cluster.ControlPlaneIsHealthy, clusterKey) + return m.healthCheck(controlPlane, checkResult, clusterKey) +} + +func (m *Management) TargetClusterEtcdHealthCheck(ctx context.Context, controlPlane *ControlPlane, clusterKey client.ObjectKey) (HealthCheckResult, error) { + workloadCluster, err := m.GetWorkloadCluster(ctx, clusterKey) + if err != nil { + return nil, err + } + return workloadCluster.EtcdIsHealthy(ctx, controlPlane) } // TargetClusterEtcdIsHealthy runs a series of checks over a target cluster's etcd cluster. // In addition, it verifies that there are the same number of etcd members as control plane Machines. 
-func (m *Management) TargetClusterEtcdIsHealthy(ctx context.Context, clusterKey client.ObjectKey) error { - cluster, err := m.GetWorkloadCluster(ctx, clusterKey) +func (m *Management) TargetClusterAnalyseEtcdHealth(ctx context.Context, controlPlane *ControlPlane, clusterKey client.ObjectKey) error { + checkResult, err := m.TargetClusterEtcdHealthCheck(ctx, controlPlane, clusterKey) if err != nil { return err } - return m.healthCheck(ctx, cluster.EtcdIsHealthy, clusterKey) + return m.healthCheck(controlPlane, checkResult, clusterKey) } diff --git a/controlplane/kubeadm/internal/cluster_test.go b/controlplane/kubeadm/internal/cluster_test.go index 4cc990e90e85..3955df649e59 100644 --- a/controlplane/kubeadm/internal/cluster_test.go +++ b/controlplane/kubeadm/internal/cluster_test.go @@ -38,6 +38,7 @@ import ( "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/runtime/schema" clusterv1 "sigs.k8s.io/cluster-api/api/v1alpha3" + "sigs.k8s.io/cluster-api/controlplane/kubeadm/api/v1alpha3" "sigs.k8s.io/cluster-api/controlplane/kubeadm/internal/machinefilters" "sigs.k8s.io/cluster-api/util/certs" "sigs.k8s.io/cluster-api/util/kubeconfig" @@ -89,7 +90,7 @@ func TestCheckStaticPodNotReadyCondition(t *testing.T) { Spec: corev1.PodSpec{}, Status: corev1.PodStatus{Conditions: test.conditions}, } - g.Expect(checkStaticPodReadyCondition(pod)).NotTo(Succeed()) + g.Expect(checkStaticPodReadyCondition(pod)).To(Succeed()) }) } } @@ -118,8 +119,10 @@ func TestControlPlaneIsHealthy(t *testing.T) { }, }, } + controlPlane, err := NewControlPlane(ctx, workloadCluster.Client, &clusterv1.Cluster{}, &v1alpha3.KubeadmControlPlane{}, nil) + g.Expect(err).NotTo(HaveOccurred()) - health, err := workloadCluster.ControlPlaneIsHealthy(context.Background()) + health, err := workloadCluster.ControlPlaneIsHealthy(context.Background(), controlPlane) g.Expect(err).NotTo(HaveOccurred()) g.Expect(health).NotTo(HaveLen(0)) g.Expect(health).To(HaveLen(len(nodeListForTestControlPlaneIsHealthy().Items))) @@ -156,7 +159,7 @@ func TestGetMachinesForCluster(t *testing.T) { func TestGetWorkloadCluster(t *testing.T) { g := NewWithT(t) - ns, err := testEnv.CreateNamespace(ctx, "workload-cluster") + ns, err := testEnv.CreateNamespace(ctx, "workload-cluster2") g.Expect(err).ToNot(HaveOccurred()) defer func() { g.Expect(testEnv.Cleanup(ctx, ns)).To(Succeed()) @@ -459,138 +462,105 @@ func (f *fakeClient) Update(_ context.Context, _ runtime.Object, _ ...client.Upd return nil } +func createControlPlane(machines []*clusterv1.Machine) *ControlPlane { + controlPlane, _ := NewControlPlane(ctx, &fakeClient{}, &clusterv1.Cluster{}, &v1alpha3.KubeadmControlPlane{}, NewFilterableMachineCollection(machines...)) + return controlPlane +} + func TestManagementCluster_healthCheck_NoError(t *testing.T) { + threeMachines := []*clusterv1.Machine{ + controlPlaneMachine("one"), + controlPlaneMachine("two"), + controlPlaneMachine("three"), + } + controlPlane := createControlPlane(threeMachines) tests := []struct { name string - machineList *clusterv1.MachineList - check healthCheck + checkResult HealthCheckResult clusterKey client.ObjectKey controlPlaneName string + controlPlane *ControlPlane }{ { name: "simple", - machineList: &clusterv1.MachineList{ - Items: []clusterv1.Machine{ - controlPlaneMachine("one"), - controlPlaneMachine("two"), - controlPlaneMachine("three"), - }, + checkResult: HealthCheckResult{ + "one": nil, + "two": nil, + "three": nil, }, - check: func(ctx context.Context) (HealthCheckResult, error) { - return HealthCheckResult{ - "one": 
nil, - "two": nil, - "three": nil, - }, nil - }, - clusterKey: client.ObjectKey{Namespace: "default", Name: "cluster-name"}, + clusterKey: client.ObjectKey{Namespace: "default", Name: "cluster-name"}, + controlPlane: controlPlane, }, } + for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { g := NewWithT(t) - ctx := context.Background() m := &Management{ - Client: &fakeClient{list: tt.machineList}, + Client: &fakeClient{}, } - g.Expect(m.healthCheck(ctx, tt.check, tt.clusterKey)).To(Succeed()) + g.Expect(m.healthCheck(controlPlane, tt.checkResult, tt.clusterKey)).To(Succeed()) }) } } func TestManagementCluster_healthCheck_Errors(t *testing.T) { + tests := []struct { name string - machineList *clusterv1.MachineList - check healthCheck + checkResult HealthCheckResult clusterKey client.ObjectKey controlPlaneName string + controlPlane *ControlPlane // expected errors will ensure the error contains this list of strings. // If not supplied, no check on the error's value will occur. expectedErrors []string }{ { name: "machine's node was not checked for health", - machineList: &clusterv1.MachineList{ - Items: []clusterv1.Machine{ - controlPlaneMachine("one"), - controlPlaneMachine("two"), - controlPlaneMachine("three"), - }, - }, - check: func(ctx context.Context) (HealthCheckResult, error) { - return HealthCheckResult{ - "one": nil, - }, nil - }, - }, - { - name: "health check returns an error not related to the nodes health", - machineList: &clusterv1.MachineList{ - Items: []clusterv1.Machine{ - controlPlaneMachine("one"), - controlPlaneMachine("two"), - controlPlaneMachine("three"), - }, + controlPlane: createControlPlane([]*clusterv1.Machine{ + controlPlaneMachine("one"), + controlPlaneMachine("two"), + controlPlaneMachine("three"), + }), + checkResult: HealthCheckResult{ + "one": nil, }, - check: func(ctx context.Context) (HealthCheckResult, error) { - return HealthCheckResult{ - "one": nil, - "two": errors.New("two"), - "three": errors.New("three"), - }, errors.New("meta") - }, - expectedErrors: []string{"two", "three", "meta"}, }, { name: "two nodes error on the check but no overall error occurred", - machineList: &clusterv1.MachineList{ - Items: []clusterv1.Machine{ - controlPlaneMachine("one"), - controlPlaneMachine("two"), - controlPlaneMachine("three"), - }, - }, - check: func(ctx context.Context) (HealthCheckResult, error) { - return HealthCheckResult{ - "one": nil, - "two": errors.New("two"), - "three": errors.New("three"), - }, nil + controlPlane: createControlPlane([]*clusterv1.Machine{ + controlPlaneMachine("one"), + controlPlaneMachine("two"), + controlPlaneMachine("three")}), + checkResult: HealthCheckResult{ + "one": nil, + "two": errors.New("two"), + "three": errors.New("three"), }, expectedErrors: []string{"two", "three"}, }, { name: "more nodes than machines were checked (out of band control plane nodes)", - machineList: &clusterv1.MachineList{ - Items: []clusterv1.Machine{ - controlPlaneMachine("one"), - }, - }, - check: func(ctx context.Context) (HealthCheckResult, error) { - return HealthCheckResult{ - "one": nil, - "two": nil, - "three": nil, - }, nil + controlPlane: createControlPlane([]*clusterv1.Machine{ + controlPlaneMachine("one")}), + checkResult: HealthCheckResult{ + "one": nil, + "two": nil, + "three": nil, }, }, { name: "a machine that has a nil node reference", - machineList: &clusterv1.MachineList{ - Items: []clusterv1.Machine{ - controlPlaneMachine("one"), - controlPlaneMachine("two"), - nilNodeRef(controlPlaneMachine("three")), - }, - }, - check: 
func(ctx context.Context) (HealthCheckResult, error) { - return HealthCheckResult{ - "one": nil, - "two": nil, - "three": nil, - }, nil + controlPlane: createControlPlane([]*clusterv1.Machine{ + controlPlaneMachine("one"), + controlPlaneMachine("two"), + nilNodeRef(controlPlaneMachine("three"))}), + checkResult: HealthCheckResult{ + "one": nil, + "two": nil, + "three": nil, }, }, } @@ -598,13 +568,10 @@ func TestManagementCluster_healthCheck_Errors(t *testing.T) { t.Run(tt.name, func(t *testing.T) { g := NewWithT(t) - ctx := context.Background() clusterKey := client.ObjectKey{Namespace: "default", Name: "cluster-name"} - m := &Management{ - Client: &fakeClient{list: tt.machineList}, - } - err := m.healthCheck(ctx, tt.check, clusterKey) + m := &Management{Client: &fakeClient{}} + err := m.healthCheck(tt.controlPlane, tt.checkResult, clusterKey) g.Expect(err).To(HaveOccurred()) for _, expectedError := range tt.expectedErrors { @@ -614,9 +581,9 @@ func TestManagementCluster_healthCheck_Errors(t *testing.T) { } } -func controlPlaneMachine(name string) clusterv1.Machine { +func controlPlaneMachine(name string) *clusterv1.Machine { t := true - return clusterv1.Machine{ + return &clusterv1.Machine{ ObjectMeta: metav1.ObjectMeta{ Namespace: "default", Name: name, @@ -637,7 +604,7 @@ func controlPlaneMachine(name string) clusterv1.Machine { } } -func nilNodeRef(machine clusterv1.Machine) clusterv1.Machine { +func nilNodeRef(machine *clusterv1.Machine) *clusterv1.Machine { machine.Status.NodeRef = nil return machine } diff --git a/controlplane/kubeadm/internal/machine_collection.go b/controlplane/kubeadm/internal/machine_collection.go index 425cea20344a..975aea467218 100644 --- a/controlplane/kubeadm/internal/machine_collection.go +++ b/controlplane/kubeadm/internal/machine_collection.go @@ -83,8 +83,8 @@ func (s FilterableMachineCollection) SortedByCreationTimestamp() []*clusterv1.Ma return res } -// unsortedList returns the slice with contents in random order. -func (s FilterableMachineCollection) unsortedList() []*clusterv1.Machine { +// UnsortedList returns the slice with contents in random order. +func (s FilterableMachineCollection) UnsortedList() []*clusterv1.Machine { res := make([]*clusterv1.Machine, 0, len(s)) for _, value := range s { res = append(res, value) @@ -110,12 +110,12 @@ func newFilteredMachineCollection(filter machinefilters.Func, machines ...*clust // Filter returns a FilterableMachineCollection containing only the Machines that match all of the given MachineFilters func (s FilterableMachineCollection) Filter(filters ...machinefilters.Func) FilterableMachineCollection { - return newFilteredMachineCollection(machinefilters.And(filters...), s.unsortedList()...) + return newFilteredMachineCollection(machinefilters.And(filters...), s.UnsortedList()...) } // AnyFilter returns a FilterableMachineCollection containing only the Machines that match any of the given MachineFilters func (s FilterableMachineCollection) AnyFilter(filters ...machinefilters.Func) FilterableMachineCollection { - return newFilteredMachineCollection(machinefilters.Or(filters...), s.unsortedList()...) + return newFilteredMachineCollection(machinefilters.Or(filters...), s.UnsortedList()...) 
} // Oldest returns the Machine with the oldest CreationTimestamp diff --git a/controlplane/kubeadm/internal/workload_cluster.go b/controlplane/kubeadm/internal/workload_cluster.go index d0db1fe684a9..c3807935d83c 100644 --- a/controlplane/kubeadm/internal/workload_cluster.go +++ b/controlplane/kubeadm/internal/workload_cluster.go @@ -53,12 +53,20 @@ var ( ErrControlPlaneMinNodes = errors.New("cluster has fewer than 2 control plane nodes; removing an etcd member is not supported") ) +// Common control-plane pod name prefixes +const ( + KubeAPIServerPodNamePrefix = "kube-apiserver" + KubeControllerManagerPodNamePrefix = "kube-controller-manager" + KubeSchedulerHealthyPodNamePrefix = "kube-scheduler" + EtcdPodNamePrefix = "etcd" +) + // WorkloadCluster defines all behaviors necessary to upgrade kubernetes on a workload cluster type WorkloadCluster interface { // Basic health and status checks. ClusterStatus(ctx context.Context) (ClusterStatus, error) - ControlPlaneIsHealthy(ctx context.Context) (HealthCheckResult, error) - EtcdIsHealthy(ctx context.Context) (HealthCheckResult, error) + ControlPlaneIsHealthy(ctx context.Context, controlPlane *ControlPlane) (HealthCheckResult, error) + EtcdIsHealthy(ctx context.Context, controlPlane *ControlPlane) (HealthCheckResult, error) // Upgrade related tasks. ReconcileKubeletRBACBinding(ctx context.Context, version semver.Version) error @@ -112,7 +120,7 @@ type HealthCheckResult map[string]error // ControlPlaneIsHealthy does a best-effort check of the control plane components the kubeadm control plane cares about. // The return map maps node names to the error that node encountered. // All nodes will exist in the map with nil errors if there were no errors for that node. -func (w *Workload) ControlPlaneIsHealthy(ctx context.Context) (HealthCheckResult, error) { +func (w *Workload) ControlPlaneIsHealthy(ctx context.Context, controlPlane *ControlPlane) (HealthCheckResult, error) { controlPlaneNodes, err := w.getControlPlaneNodes(ctx) if err != nil { return nil, err diff --git a/controlplane/kubeadm/internal/workload_cluster_etcd.go b/controlplane/kubeadm/internal/workload_cluster_etcd.go index 78261db27911..3951ca371497 100644 --- a/controlplane/kubeadm/internal/workload_cluster_etcd.go +++ b/controlplane/kubeadm/internal/workload_cluster_etcd.go @@ -38,7 +38,7 @@ type etcdClientFor interface { // This is a best effort check and nodes can become unhealthy after the check is complete. It is not a guarantee. // It's used as a signal for whether we should allow a target cluster to scale up, scale down or upgrade. // It returns a map of nodes checked along with an error for a given node.
-func (w *Workload) EtcdIsHealthy(ctx context.Context) (HealthCheckResult, error) { +func (w *Workload) EtcdIsHealthy(ctx context.Context, controlPlane *ControlPlane) (HealthCheckResult, error) { var knownClusterID uint64 var knownMemberIDSet etcdutil.UInt64Set diff --git a/controlplane/kubeadm/internal/workload_cluster_etcd_test.go b/controlplane/kubeadm/internal/workload_cluster_etcd_test.go index d4384d4bbec7..e997135494bf 100644 --- a/controlplane/kubeadm/internal/workload_cluster_etcd_test.go +++ b/controlplane/kubeadm/internal/workload_cluster_etcd_test.go @@ -39,6 +39,12 @@ import ( func TestWorkload_EtcdIsHealthy(t *testing.T) { g := NewWithT(t) + controlPlane := createControlPlane([]*clusterv1.Machine{ + controlPlaneMachine("one"), + controlPlaneMachine("two"), + controlPlaneMachine("three"), + }) + workload := &Workload{ Client: &fakeClient{ get: map[string]interface{}{ @@ -75,7 +81,7 @@ func TestWorkload_EtcdIsHealthy(t *testing.T) { }, } ctx := context.Background() - health, err := workload.EtcdIsHealthy(ctx) + health, err := workload.EtcdIsHealthy(ctx, controlPlane) g.Expect(err).NotTo(HaveOccurred()) for _, err := range health {
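Note (illustrative sketch, not part of the patch): the core of the refactor is that the workload cluster now only produces a per-node HealthCheckResult, and Management.healthCheck cross-references that result against the KCP-owned machines it is handed. The standalone Go sketch below shows that matching logic under simplified assumptions; the machine and healthCheckResult types are hypothetical stand-ins for the real Cluster API types, and errors.Join (Go 1.20+) stands in for kerrors.NewAggregate used in the actual code.

package main

import (
	"errors"
	"fmt"
)

// healthCheckResult mirrors internal.HealthCheckResult: node name -> error for that node.
type healthCheckResult map[string]error

// machine is a simplified stand-in for a KCP-owned clusterv1.Machine.
type machine struct {
	name    string
	nodeRef string // empty string stands in for a nil status.nodeRef
}

// healthCheck sketches the refactored Management.healthCheck: aggregate the
// per-node errors first, then enforce a 1:1 match between checked nodes and
// KCP-owned machines.
func healthCheck(machines []machine, nodeChecks healthCheckResult) error {
	var errs []error
	for nodeName, err := range nodeChecks {
		if err != nil {
			errs = append(errs, fmt.Errorf("node %q: %v", nodeName, err))
		}
	}
	if len(errs) > 0 {
		return errors.Join(errs...)
	}
	for _, m := range machines {
		if m.nodeRef == "" {
			return fmt.Errorf("control plane machine %s has no status.nodeRef", m.name)
		}
		if _, ok := nodeChecks[m.nodeRef]; !ok {
			return fmt.Errorf("machine %s's node %s was not checked", m.name, m.nodeRef)
		}
	}
	if len(nodeChecks) != len(machines) {
		return fmt.Errorf("number of nodes (%d) and machines (%d) did not match", len(nodeChecks), len(machines))
	}
	return nil
}

func main() {
	machines := []machine{{name: "cp-0", nodeRef: "node-0"}, {name: "cp-1", nodeRef: "node-1"}}
	// One node reports an unhealthy etcd pod, so the aggregate check fails.
	checks := healthCheckResult{"node-0": nil, "node-1": errors.New("etcd pod is not ready")}
	fmt.Println(healthCheck(machines, checks))
}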