From 1b7d9ea0ece3d70e94cdd17e7bf4ff5278d5078a Mon Sep 17 00:00:00 2001 From: Sedef Date: Mon, 21 Sep 2020 23:31:11 -0700 Subject: [PATCH 01/12] Add Machine and KCP conditions to KCP controller --- api/v1alpha3/condition_consts.go | 54 +++++- .../kubeadm/api/v1alpha3/condition_consts.go | 11 ++ .../kubeadm/controllers/controller.go | 78 +++++++- .../kubeadm/controllers/controller_test.go | 4 +- .../kubeadm/controllers/fakes_test.go | 4 +- controlplane/kubeadm/controllers/scale.go | 12 +- .../kubeadm/controllers/scale_test.go | 17 +- controlplane/kubeadm/controllers/status.go | 3 +- .../kubeadm/controllers/upgrade_test.go | 8 +- controlplane/kubeadm/internal/cluster.go | 43 +++-- controlplane/kubeadm/internal/cluster_test.go | 118 ++++++------ .../kubeadm/internal/control_plane.go | 16 +- .../kubeadm/internal/etcd_client_generator.go | 3 +- .../kubeadm/internal/machine_collection.go | 8 +- .../kubeadm/internal/workload_cluster.go | 181 ++++++++++++++++-- .../kubeadm/internal/workload_cluster_etcd.go | 60 ++++-- .../internal/workload_cluster_etcd_test.go | 8 +- 17 files changed, 466 insertions(+), 162 deletions(-) diff --git a/api/v1alpha3/condition_consts.go b/api/v1alpha3/condition_consts.go index d75d0fc78cea..9b85e76aff9b 100644 --- a/api/v1alpha3/condition_consts.go +++ b/api/v1alpha3/condition_consts.go @@ -109,7 +109,8 @@ const ( // MachineHasFailureReason is the reason used when a machine has either a FailureReason or a FailureMessage set on its status. MachineHasFailureReason = "MachineHasFailure" - // NodeNotFoundReason is the reason used when a machine's node has previously been observed but is now gone. + // NodeNotFoundReason (Severity=Error) documents a machine's node has previously been observed but is now gone. + // NB. provisioned --> NodeRef != "" NodeNotFoundReason = "NodeNotFound" // NodeStartupTimeoutReason is the reason used when a machine's node does not appear within the specified timeout. @@ -120,10 +121,59 @@ const ( ) const ( - // MachineOwnerRemediatedCondition is set on machines that have failed a healthcheck by the MachineHealthCheck controller. + // MachineOwnerRemediatedCondition is set on machines that have failed a healthcheck by the Machine's owner controller. // MachineOwnerRemediatedCondition is set to False after a health check fails, but should be changed to True by the owning controller after remediation succeeds. MachineOwnerRemediatedCondition ConditionType = "OwnerRemediated" // WaitingForRemediationReason is the reason used when a machine fails a health check and remediation is needed. WaitingForRemediationReason = "WaitingForRemediation" ) + +// Common Pod-related Condition Reasons used by Pod-related Conditions such as MachineAPIServerPodHealthyCondition etc. +const ( + // PodProvisioningReason (Severity=Info) documents a pod waiting to be provisioned i.e., Pod is in "Pending" phase and + // PodScheduled and Initialized conditions are not yet set to True. + PodProvisioningReason = "PodProvisioning" + + // PodMissingReason (Severity=Warning) documents a pod does not exist. 
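+	// In the health checks added by this change, this typically corresponds to the static pod for a
+	// control plane component not being found on the workload cluster.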
+	PodMissingReason = "PodMissing"
+
+	// PodFailedReason (Severity=Error) documents that
+	// i) a pod failed during provisioning, i.e., the Pod is in the "Pending" phase and the
+	// PodScheduled and Initialized conditions are set to True but the ContainersReady or Ready condition is false
+	// (i.e., at least one of the containers is in a waiting state, e.g. CrashLoopBackOff or ImagePullBackOff), or
+	// ii) a pod has at least one container that terminated with a failure and hence the Pod is in the "Failed" phase.
+	PodFailedReason = "PodFailed"
+)
+
+// Conditions that are only for control-plane machines. KubeadmControlPlane is the owner of these conditions.
+
+const (
+	// MachineAPIServerPodHealthyCondition reports a machine's kube-apiserver's health status.
+	// Set to true if kube-apiserver pod is in "Running" phase, otherwise uses Pod-related Condition Reasons.
+	MachineAPIServerPodHealthyCondition ConditionType = "APIServerPodHealthy"
+
+	// MachineControllerManagerHealthyCondition reports a machine's kube-controller-manager's health status.
+	// Set to true if kube-controller-manager pod is in "Running" phase, otherwise uses Pod-related Condition Reasons.
+	MachineControllerManagerHealthyCondition ConditionType = "ControllerManagerPodHealthy"
+
+	// MachineSchedulerPodHealthyCondition reports a machine's kube-scheduler's health status.
+	// Set to true if kube-scheduler pod is in "Running" phase, otherwise uses Pod-related Condition Reasons.
+	MachineSchedulerPodHealthyCondition ConditionType = "SchedulerPodHealthy"
+
+	// MachineEtcdPodHealthyCondition reports a machine's etcd pod's health status.
+	// Set to true if etcd pod is in "Running" phase, otherwise uses Pod-related Condition Reasons.
+	MachineEtcdPodHealthyCondition ConditionType = "EtcdPodHealthy"
+)
+
+const (
+	// MachineEtcdMemberHealthyCondition documents whether the machine has a healthy etcd member.
+	// If not true, Pod-related Condition Reasons can be used as reasons.
+	MachineEtcdMemberHealthyCondition ConditionType = "EtcdMemberHealthy"
+
+	// EtcdMemberUnhealthyReason (Severity=Error) documents that a Machine's etcd member is unhealthy for a number of reasons:
+	// i) the etcd member has alarms.
+	// ii) creating the etcd client fails, or using the created etcd client to perform some operations fails.
+	// iii) quorum is lost.
+	EtcdMemberUnhealthyReason = "EtcdMemberUnhealthy"
+)
diff --git a/controlplane/kubeadm/api/v1alpha3/condition_consts.go b/controlplane/kubeadm/api/v1alpha3/condition_consts.go
index bf512965e6cc..ea7ece99ec5c 100644
--- a/controlplane/kubeadm/api/v1alpha3/condition_consts.go
+++ b/controlplane/kubeadm/api/v1alpha3/condition_consts.go
@@ -66,3 +66,14 @@ const (
 	// ScalingDownReason (Severity=Info) documents a KubeadmControlPlane that is decreasing the number of replicas.
 	ScalingDownReason = "ScalingDown"
 )
+
+const (
+	// EtcdClusterHealthy documents the overall health of the KCP-managed etcd cluster.
+	EtcdClusterHealthy clusterv1.ConditionType = "EtcdClusterHealthy"
+
+	// EtcdClusterUnhealthyReason (Severity=Warning) is set when the etcd cluster is unhealthy because
+	// i) the etcd cluster has lost its quorum.
+	// ii) the etcd cluster has alarms armed.
+	// iii) the etcd pods do not match the etcd members.
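+	// This reason is used with the EtcdClusterHealthy condition, which the etcd health check sets on the
+	// KubeadmControlPlane object and which is included in the Ready condition summary.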
+ EtcdClusterUnhealthyReason = "EtcdClusterUnhealthy" +) diff --git a/controlplane/kubeadm/controllers/controller.go b/controlplane/kubeadm/controllers/controller.go index d6a10904cd75..081853892fee 100644 --- a/controlplane/kubeadm/controllers/controller.go +++ b/controlplane/kubeadm/controllers/controller.go @@ -212,6 +212,27 @@ func (r *KubeadmControlPlaneReconciler) Reconcile(req ctrl.Request) (res ctrl.Re return r.reconcile(ctx, cluster, kcp) } +// setSingleMachineConditions updates the machine's conditions according to health tracker. +func setSingleMachineConditions(machine *clusterv1.Machine, controlPlane *internal.ControlPlane) { + for condType, condition := range controlPlane.MachineConditions[machine.Name] { + doesConditionExist := false + for _, mCondition := range machine.Status.Conditions { + // If the condition already exists, change the condition. + if mCondition.Type == condType { + conditions.Set(machine, condition) + doesConditionExist = true + } + } + if !doesConditionExist { + if machine.Status.Conditions == nil { + machine.Status.Conditions = clusterv1.Conditions{} + } + conditions.Set(machine, condition) + } + + } +} + func patchKubeadmControlPlane(ctx context.Context, patchHelper *patch.Helper, kcp *controlplanev1.KubeadmControlPlane) error { // Always update the readyCondition by summarizing the state of other conditions. conditions.SetSummary(kcp, @@ -221,6 +242,7 @@ func patchKubeadmControlPlane(ctx context.Context, patchHelper *patch.Helper, kc controlplanev1.MachinesReadyCondition, controlplanev1.AvailableCondition, controlplanev1.CertificatesAvailableCondition, + controlplanev1.EtcdClusterHealthy, ), ) @@ -305,6 +327,14 @@ func (r *KubeadmControlPlaneReconciler) reconcile(ctx context.Context, cluster * // source ref (reason@machine/name) so the problem can be easily tracked down to its source machine. conditions.SetAggregate(controlPlane.KCP, controlplanev1.MachinesReadyCondition, ownedMachines.ConditionGetters(), conditions.AddSourceRef()) + // If control plane is initialized, reconcile health. + if ownedMachines.Len() != 0 { + // reconcileControlPlaneHealth returns err if there is a machine being delete + if result, err := r.reconcileControlPlaneHealth(ctx, cluster, kcp, controlPlane); err != nil || !result.IsZero() { + return result, err + } + } + // Control plane machines rollout due to configuration changes (e.g. upgrades) takes precedence over other operations. needRollout := controlPlane.MachinesNeedingRollout() switch { @@ -442,21 +472,59 @@ func (r *KubeadmControlPlaneReconciler) ClusterToKubeadmControlPlane(o handler.M return nil } -// reconcileHealth performs health checks for control plane components and etcd +func patchControlPlaneMachine(ctx context.Context, patchHelper *patch.Helper, machine *clusterv1.Machine) error { + // Patch the object, ignoring conflicts on the conditions owned by this controller. + + // TODO: Is it okay to own these conditions or just patch? + // return patchHelper.Patch(ctx, machine) + + return patchHelper.Patch( + ctx, + machine, + patch.WithOwnedConditions{Conditions: []clusterv1.ConditionType{ + clusterv1.MachineAPIServerPodHealthyCondition, + clusterv1.MachineControllerManagerHealthyCondition, + clusterv1.MachineEtcdMemberHealthyCondition, + clusterv1.MachineEtcdPodHealthyCondition, + clusterv1.MachineSchedulerPodHealthyCondition, + }}, + ) +} + +// reconcileControlPlaneHealth performs health checks for control plane components and etcd // It removes any etcd members that do not have a corresponding node. 
// Also, as a final step, checks if there is any machines that is being deleted. -func (r *KubeadmControlPlaneReconciler) reconcileHealth(ctx context.Context, cluster *clusterv1.Cluster, kcp *controlplanev1.KubeadmControlPlane, controlPlane *internal.ControlPlane) (ctrl.Result, error) { +func (r *KubeadmControlPlaneReconciler) reconcileControlPlaneHealth(ctx context.Context, cluster *clusterv1.Cluster, kcp *controlplanev1.KubeadmControlPlane, controlPlane *internal.ControlPlane) (ctrl.Result, error) { + logger := r.Log.WithValues("namespace", kcp.Namespace, "kubeadmControlPlane", kcp.Name) + + for _, m := range controlPlane.Machines { + // Initialize the patch helper. + patchHelper, err := patch.NewHelper(m, r.Client) + if err != nil { + logger.Error(err, "Failed to configure the patch helper") + return ctrl.Result{Requeue: true}, nil + } + + machine := m + defer func() { + setSingleMachineConditions(machine, controlPlane) + // Always attempt to Patch the Machine conditions after each health reconciliation. + if err := patchControlPlaneMachine(ctx, patchHelper, machine); err != nil { + logger.Error(err, "Failed to patch KubeadmControlPlane Machine") + } + }() + } // Do a health check of the Control Plane components - if err := r.managementCluster.TargetClusterControlPlaneIsHealthy(ctx, util.ObjectKey(cluster)); err != nil { + if err := r.managementCluster.TargetClusterControlPlaneIsHealthy(ctx, controlPlane, util.ObjectKey(cluster)); err != nil { r.recorder.Eventf(kcp, corev1.EventTypeWarning, "ControlPlaneUnhealthy", "Waiting for control plane to pass control plane health check to continue reconciliation: %v", err) - return ctrl.Result{RequeueAfter: healthCheckFailedRequeueAfter}, nil + return ctrl.Result{}, errors.Wrap(err, "failed to pass control-plane health check") } // If KCP should manage etcd, ensure etcd is healthy. 
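+	// When the cluster uses an external (not KCP-managed) etcd, this check is skipped.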
if controlPlane.IsEtcdManaged() { - if err := r.managementCluster.TargetClusterEtcdIsHealthy(ctx, util.ObjectKey(cluster)); err != nil { + if err := r.managementCluster.TargetClusterEtcdIsHealthy(ctx, controlPlane, util.ObjectKey(cluster)); err != nil { errList := []error{errors.Wrap(err, "failed to pass etcd health check")} r.recorder.Eventf(kcp, corev1.EventTypeWarning, "ControlPlaneUnhealthy", "Waiting for control plane to pass etcd health check to continue reconciliation: %v", err) diff --git a/controlplane/kubeadm/controllers/controller_test.go b/controlplane/kubeadm/controllers/controller_test.go index 01dd7b2bf6eb..6085113831e5 100644 --- a/controlplane/kubeadm/controllers/controller_test.go +++ b/controlplane/kubeadm/controllers/controller_test.go @@ -572,7 +572,7 @@ func TestKubeadmControlPlaneReconciler_adoption(t *testing.T) { g := NewWithT(t) cluster, kcp, tmpl := createClusterWithControlPlane() - cluster.Spec.ControlPlaneEndpoint.Host = "nodomain.example.com" + cluster.Spec.ControlPlaneEndpoint.Host = "nodomain2.example.com" cluster.Spec.ControlPlaneEndpoint.Port = 6443 kcp.Spec.Version = version @@ -642,7 +642,7 @@ func TestKubeadmControlPlaneReconciler_adoption(t *testing.T) { g := NewWithT(t) cluster, kcp, tmpl := createClusterWithControlPlane() - cluster.Spec.ControlPlaneEndpoint.Host = "nodomain.example.com" + cluster.Spec.ControlPlaneEndpoint.Host = "nodomain3.example.com" cluster.Spec.ControlPlaneEndpoint.Port = 6443 kcp.Spec.Version = "v1.17.0" diff --git a/controlplane/kubeadm/controllers/fakes_test.go b/controlplane/kubeadm/controllers/fakes_test.go index 46af76804696..fce1de771386 100644 --- a/controlplane/kubeadm/controllers/fakes_test.go +++ b/controlplane/kubeadm/controllers/fakes_test.go @@ -57,14 +57,14 @@ func (f *fakeManagementCluster) GetMachinesForCluster(c context.Context, n clien return f.Machines, nil } -func (f *fakeManagementCluster) TargetClusterControlPlaneIsHealthy(_ context.Context, _ client.ObjectKey) error { +func (f *fakeManagementCluster) TargetClusterControlPlaneIsHealthy(_ context.Context, _ *internal.ControlPlane, _ client.ObjectKey) error { if !f.ControlPlaneHealthy { return errors.New("control plane is not healthy") } return nil } -func (f *fakeManagementCluster) TargetClusterEtcdIsHealthy(_ context.Context, _ client.ObjectKey) error { +func (f *fakeManagementCluster) TargetClusterEtcdIsHealthy(_ context.Context, _ *internal.ControlPlane, _ client.ObjectKey) error { if !f.EtcdHealthy { return errors.New("etcd is not healthy") } diff --git a/controlplane/kubeadm/controllers/scale.go b/controlplane/kubeadm/controllers/scale.go index a89d5b8e5322..51e7a305caf1 100644 --- a/controlplane/kubeadm/controllers/scale.go +++ b/controlplane/kubeadm/controllers/scale.go @@ -63,11 +63,6 @@ func (r *KubeadmControlPlaneReconciler) initializeControlPlane(ctx context.Conte func (r *KubeadmControlPlaneReconciler) scaleUpControlPlane(ctx context.Context, cluster *clusterv1.Cluster, kcp *controlplanev1.KubeadmControlPlane, controlPlane *internal.ControlPlane) (ctrl.Result, error) { logger := controlPlane.Logger() - // reconcileHealth returns err if there is a machine being delete which is a required condition to check before scaling up - if result, err := r.reconcileHealth(ctx, cluster, kcp, controlPlane); err != nil || !result.IsZero() { - return result, err - } - // Create the bootstrap configuration bootstrapSpec := controlPlane.JoinControlPlaneConfig() fd := controlPlane.NextFailureDomainForScaleUp() @@ -90,10 +85,6 @@ func (r 
*KubeadmControlPlaneReconciler) scaleDownControlPlane( ) (ctrl.Result, error) { logger := controlPlane.Logger() - if result, err := r.reconcileHealth(ctx, cluster, kcp, controlPlane); err != nil || !result.IsZero() { - return result, err - } - workloadCluster, err := r.managementCluster.GetWorkloadCluster(ctx, util.ObjectKey(cluster)) if err != nil { logger.Error(err, "Failed to create client to workload cluster") @@ -123,7 +114,8 @@ func (r *KubeadmControlPlaneReconciler) scaleDownControlPlane( } } - if err := r.managementCluster.TargetClusterControlPlaneIsHealthy(ctx, util.ObjectKey(cluster)); err != nil { + // TODO: check if this is needed after moving the health check to the main reconcile + if err := r.managementCluster.TargetClusterControlPlaneIsHealthy(ctx, controlPlane, util.ObjectKey(cluster)); err != nil { logger.V(2).Info("Waiting for control plane to pass control plane health check before removing a control plane machine", "cause", err) r.recorder.Eventf(kcp, corev1.EventTypeWarning, "ControlPlaneUnhealthy", "Waiting for control plane to pass control plane health check before removing a control plane machine: %v", err) diff --git a/controlplane/kubeadm/controllers/scale_test.go b/controlplane/kubeadm/controllers/scale_test.go index ceed678aec27..de19e72a3559 100644 --- a/controlplane/kubeadm/controllers/scale_test.go +++ b/controlplane/kubeadm/controllers/scale_test.go @@ -116,7 +116,7 @@ func TestKubeadmControlPlaneReconciler_scaleUpControlPlane(t *testing.T) { result, err := r.scaleUpControlPlane(context.Background(), cluster, kcp, controlPlane) g.Expect(result).To(Equal(ctrl.Result{Requeue: true})) - g.Expect(err).ToNot(HaveOccurred()) + g.Expect(err).NotTo(HaveOccurred()) controlPlaneMachines := clusterv1.MachineList{} g.Expect(fakeClient.List(context.Background(), &controlPlaneMachines)).To(Succeed()) @@ -124,6 +124,8 @@ func TestKubeadmControlPlaneReconciler_scaleUpControlPlane(t *testing.T) { }) t.Run("does not create a control plane Machine if health checks fail", func(t *testing.T) { cluster, kcp, genericMachineTemplate := createClusterWithControlPlane() + cluster.Spec.ControlPlaneEndpoint.Host = "nodomain.example.com" + cluster.Spec.ControlPlaneEndpoint.Port = 6443 initObjs := []runtime.Object{cluster.DeepCopy(), kcp.DeepCopy(), genericMachineTemplate.DeepCopy()} beforeMachines := internal.NewFilterableMachineCollection() @@ -170,18 +172,11 @@ func TestKubeadmControlPlaneReconciler_scaleUpControlPlane(t *testing.T) { Log: log.Log, recorder: record.NewFakeRecorder(32), } - controlPlane := &internal.ControlPlane{ - KCP: kcp, - Cluster: cluster, - Machines: beforeMachines, - } - result, err := r.scaleUpControlPlane(context.Background(), cluster.DeepCopy(), kcp.DeepCopy(), controlPlane) - if tc.expectErr { - g.Expect(err).To(HaveOccurred()) - } - g.Expect(result).To(Equal(tc.expectResult)) + _, err := r.reconcile(context.Background(), cluster, kcp) + g.Expect(err).To(HaveOccurred()) + // scaleUpControlPlane is never called due to health check failure and new machine is not created to scale up. 
controlPlaneMachines := &clusterv1.MachineList{} g.Expect(fakeClient.List(context.Background(), controlPlaneMachines)).To(Succeed()) g.Expect(controlPlaneMachines.Items).To(HaveLen(len(beforeMachines))) diff --git a/controlplane/kubeadm/controllers/status.go b/controlplane/kubeadm/controllers/status.go index 0c6b0ceac13a..6227ca00637b 100644 --- a/controlplane/kubeadm/controllers/status.go +++ b/controlplane/kubeadm/controllers/status.go @@ -18,7 +18,6 @@ package controllers import ( "context" - "github.com/pkg/errors" clusterv1 "sigs.k8s.io/cluster-api/api/v1alpha3" controlplanev1 "sigs.k8s.io/cluster-api/controlplane/kubeadm/api/v1alpha3" @@ -29,7 +28,7 @@ import ( ) // updateStatus is called after every reconcilitation loop in a defer statement to always make sure we have the -// resource status subresourcs up-to-date. +// resource status subresources up-to-date. func (r *KubeadmControlPlaneReconciler) updateStatus(ctx context.Context, kcp *controlplanev1.KubeadmControlPlane, cluster *clusterv1.Cluster) error { selector := machinefilters.ControlPlaneSelectorForCluster(cluster.Name) // Copy label selector to its status counterpart in string format. diff --git a/controlplane/kubeadm/controllers/upgrade_test.go b/controlplane/kubeadm/controllers/upgrade_test.go index 53596c317188..f0702e2a3da2 100644 --- a/controlplane/kubeadm/controllers/upgrade_test.go +++ b/controlplane/kubeadm/controllers/upgrade_test.go @@ -36,6 +36,8 @@ func TestKubeadmControlPlaneReconciler_upgradeControlPlane(t *testing.T) { g := NewWithT(t) cluster, kcp, genericMachineTemplate := createClusterWithControlPlane() + cluster.Spec.ControlPlaneEndpoint.Host = "nodomain.example.com" + cluster.Spec.ControlPlaneEndpoint.Port = 6443 kcp.Spec.Version = "v1.17.3" kcp.Spec.KubeadmConfigSpec.ClusterConfiguration = nil kcp.Spec.Replicas = pointer.Int32Ptr(1) @@ -89,9 +91,9 @@ func TestKubeadmControlPlaneReconciler_upgradeControlPlane(t *testing.T) { // run upgrade a second time, simulate that the node has not appeared yet but the machine exists r.managementCluster.(*fakeManagementCluster).ControlPlaneHealthy = false - result, err = r.upgradeControlPlane(context.Background(), cluster, kcp, controlPlane, needingUpgrade) - g.Expect(result).To(Equal(ctrl.Result{RequeueAfter: healthCheckFailedRequeueAfter})) - g.Expect(err).To(BeNil()) + // Unhealthy control plane will be detected during reconcile loop and upgrade will never be called. 
+ _, err = r.reconcile(context.Background(), cluster, kcp) + g.Expect(err).To(HaveOccurred()) g.Expect(fakeClient.List(context.Background(), bothMachines, client.InNamespace(cluster.Namespace))).To(Succeed()) g.Expect(bothMachines.Items).To(HaveLen(2)) diff --git a/controlplane/kubeadm/internal/cluster.go b/controlplane/kubeadm/internal/cluster.go index 0789c27e88f1..363e852411c5 100644 --- a/controlplane/kubeadm/internal/cluster.go +++ b/controlplane/kubeadm/internal/cluster.go @@ -28,13 +28,12 @@ import ( "k8s.io/apimachinery/pkg/runtime" kerrors "k8s.io/apimachinery/pkg/util/errors" "k8s.io/client-go/kubernetes/scheme" - "sigs.k8s.io/controller-runtime/pkg/client" - ctrlclient "sigs.k8s.io/controller-runtime/pkg/client" - clusterv1 "sigs.k8s.io/cluster-api/api/v1alpha3" "sigs.k8s.io/cluster-api/controllers/remote" "sigs.k8s.io/cluster-api/controlplane/kubeadm/internal/machinefilters" "sigs.k8s.io/cluster-api/util/secret" + "sigs.k8s.io/controller-runtime/pkg/client" + ctrlclient "sigs.k8s.io/controller-runtime/pkg/client" ) // ManagementCluster defines all behaviors necessary for something to function as a management cluster. @@ -42,8 +41,8 @@ type ManagementCluster interface { ctrlclient.Reader GetMachinesForCluster(ctx context.Context, cluster client.ObjectKey, filters ...machinefilters.Func) (FilterableMachineCollection, error) - TargetClusterEtcdIsHealthy(ctx context.Context, clusterKey client.ObjectKey) error - TargetClusterControlPlaneIsHealthy(ctx context.Context, clusterKey client.ObjectKey) error + TargetClusterEtcdIsHealthy(ctx context.Context, controlPlane *ControlPlane, clusterKey client.ObjectKey) error + TargetClusterControlPlaneIsHealthy(ctx context.Context, controlPlane *ControlPlane, clusterKey client.ObjectKey) error GetWorkloadCluster(ctx context.Context, clusterKey client.ObjectKey) (WorkloadCluster, error) } @@ -180,16 +179,22 @@ func (m *Management) getApiServerEtcdClientCert(ctx context.Context, clusterKey return tls.X509KeyPair(crtData, keyData) } -type healthCheck func(context.Context) (HealthCheckResult, error) +type healthCheck func(context.Context, *ControlPlane) (HealthCheckResult, error) // HealthCheck will run a generic health check function and report any errors discovered. // In addition to the health check, it also ensures there is a 1;1 match between nodes and machines. -func (m *Management) healthCheck(ctx context.Context, check healthCheck, clusterKey client.ObjectKey) error { +// To have access to the owned control-plane machines during health checks, need to pass owningMachines here. +func (m *Management) healthCheck(ctx context.Context, controlPlane *ControlPlane, check healthCheck, clusterKey client.ObjectKey) error { var errorList []error - nodeChecks, err := check(ctx) + kcpMachines := controlPlane.Machines.UnsortedList() + // Make sure Cluster API is aware of all the nodes. + + nodeChecks, err := check(ctx, controlPlane) if err != nil { errorList = append(errorList, err) } + + // TODO: If any node has a failure, healthCheck fails. This may be too strict for the health check of HA clusters (Missing a single etcd pod does not indicate a problem)... for nodeName, err := range nodeChecks { if err != nil { errorList = append(errorList, fmt.Errorf("node %q: %v", nodeName, err)) @@ -199,44 +204,40 @@ func (m *Management) healthCheck(ctx context.Context, check healthCheck, cluster return kerrors.NewAggregate(errorList) } - // Make sure Cluster API is aware of all the nodes. 
- machines, err := m.GetMachinesForCluster(ctx, clusterKey, machinefilters.ControlPlaneMachines(clusterKey.Name)) - if err != nil { - return err - } - // This check ensures there is a 1 to 1 correspondence of nodes and machines. // If a machine was not checked this is considered an error. - for _, machine := range machines { + for _, machine := range kcpMachines { if machine.Status.NodeRef == nil { + // The condition for this case is set by the Machine controller return errors.Errorf("control plane machine %s/%s has no status.nodeRef", machine.Namespace, machine.Name) } if _, ok := nodeChecks[machine.Status.NodeRef.Name]; !ok { return errors.Errorf("machine's (%s/%s) node (%s) was not checked", machine.Namespace, machine.Name, machine.Status.NodeRef.Name) } } - if len(nodeChecks) != len(machines) { - return errors.Errorf("number of nodes and machines in namespace %s did not match: %d nodes %d machines", clusterKey.Namespace, len(nodeChecks), len(machines)) + if len(nodeChecks) != len(kcpMachines) { + // MachinesReadyCondition covers this health failure. + return errors.Errorf("number of nodes and machines in namespace %s did not match: %d nodes %d machines", clusterKey.Namespace, len(nodeChecks), len(kcpMachines)) } return nil } // TargetClusterControlPlaneIsHealthy checks every node for control plane health. -func (m *Management) TargetClusterControlPlaneIsHealthy(ctx context.Context, clusterKey client.ObjectKey) error { +func (m *Management) TargetClusterControlPlaneIsHealthy(ctx context.Context, controlPlane *ControlPlane, clusterKey client.ObjectKey) error { // TODO: add checks for expected taints/labels cluster, err := m.GetWorkloadCluster(ctx, clusterKey) if err != nil { return err } - return m.healthCheck(ctx, cluster.ControlPlaneIsHealthy, clusterKey) + return m.healthCheck(ctx, controlPlane, cluster.ControlPlaneIsHealthy, clusterKey) } // TargetClusterEtcdIsHealthy runs a series of checks over a target cluster's etcd cluster. // In addition, it verifies that there are the same number of etcd members as control plane Machines. 
-func (m *Management) TargetClusterEtcdIsHealthy(ctx context.Context, clusterKey client.ObjectKey) error { +func (m *Management) TargetClusterEtcdIsHealthy(ctx context.Context, controlPlane *ControlPlane, clusterKey client.ObjectKey) error { cluster, err := m.GetWorkloadCluster(ctx, clusterKey) if err != nil { return err } - return m.healthCheck(ctx, cluster.EtcdIsHealthy, clusterKey) + return m.healthCheck(ctx, controlPlane, cluster.EtcdIsHealthy, clusterKey) } diff --git a/controlplane/kubeadm/internal/cluster_test.go b/controlplane/kubeadm/internal/cluster_test.go index 4cc990e90e85..24f1845b90fa 100644 --- a/controlplane/kubeadm/internal/cluster_test.go +++ b/controlplane/kubeadm/internal/cluster_test.go @@ -25,6 +25,9 @@ import ( "errors" "fmt" "math/big" + "sigs.k8s.io/cluster-api/controlplane/kubeadm/api/v1alpha3" + "sigs.k8s.io/cluster-api/util/kubeconfig" + "sigs.k8s.io/cluster-api/util/secret" "testing" "time" @@ -40,8 +43,6 @@ import ( clusterv1 "sigs.k8s.io/cluster-api/api/v1alpha3" "sigs.k8s.io/cluster-api/controlplane/kubeadm/internal/machinefilters" "sigs.k8s.io/cluster-api/util/certs" - "sigs.k8s.io/cluster-api/util/kubeconfig" - "sigs.k8s.io/cluster-api/util/secret" "sigs.k8s.io/controller-runtime/pkg/client" ) @@ -118,8 +119,10 @@ func TestControlPlaneIsHealthy(t *testing.T) { }, }, } + controlPlane, err := NewControlPlane(ctx, workloadCluster.Client, &clusterv1.Cluster{}, &v1alpha3.KubeadmControlPlane{}, nil) + g.Expect(err).NotTo(HaveOccurred()) - health, err := workloadCluster.ControlPlaneIsHealthy(context.Background()) + health, err := workloadCluster.ControlPlaneIsHealthy(context.Background(), controlPlane) g.Expect(err).NotTo(HaveOccurred()) g.Expect(health).NotTo(HaveLen(0)) g.Expect(health).To(HaveLen(len(nodeListForTestControlPlaneIsHealthy().Items))) @@ -156,7 +159,7 @@ func TestGetMachinesForCluster(t *testing.T) { func TestGetWorkloadCluster(t *testing.T) { g := NewWithT(t) - ns, err := testEnv.CreateNamespace(ctx, "workload-cluster") + ns, err := testEnv.CreateNamespace(ctx, "workload-cluster2") g.Expect(err).ToNot(HaveOccurred()) defer func() { g.Expect(testEnv.Cleanup(ctx, ns)).To(Succeed()) @@ -459,67 +462,71 @@ func (f *fakeClient) Update(_ context.Context, _ runtime.Object, _ ...client.Upd return nil } +func createControlPlane(machines []*clusterv1.Machine) *ControlPlane { + controlPlane, _ := NewControlPlane(ctx, &fakeClient{}, &clusterv1.Cluster{}, &v1alpha3.KubeadmControlPlane{}, NewFilterableMachineCollection(machines...)) + return controlPlane +} func TestManagementCluster_healthCheck_NoError(t *testing.T) { + threeMachines := []*clusterv1.Machine{ + controlPlaneMachine("one"), + controlPlaneMachine("two"), + controlPlaneMachine("three"), + } + controlPlane := createControlPlane(threeMachines) tests := []struct { name string - machineList *clusterv1.MachineList check healthCheck clusterKey client.ObjectKey controlPlaneName string + controlPlane *ControlPlane }{ { name: "simple", - machineList: &clusterv1.MachineList{ - Items: []clusterv1.Machine{ - controlPlaneMachine("one"), - controlPlaneMachine("two"), - controlPlaneMachine("three"), - }, - }, - check: func(ctx context.Context) (HealthCheckResult, error) { + check: func(ctx context.Context, controlPlane *ControlPlane) (HealthCheckResult, error) { return HealthCheckResult{ "one": nil, "two": nil, "three": nil, }, nil }, - clusterKey: client.ObjectKey{Namespace: "default", Name: "cluster-name"}, + clusterKey: client.ObjectKey{Namespace: "default", Name: "cluster-name"}, + controlPlane: 
controlPlane, }, } + for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { g := NewWithT(t) ctx := context.Background() m := &Management{ - Client: &fakeClient{list: tt.machineList}, + Client: &fakeClient{}, } - g.Expect(m.healthCheck(ctx, tt.check, tt.clusterKey)).To(Succeed()) + g.Expect(m.healthCheck(ctx, controlPlane, tt.check, tt.clusterKey)).To(Succeed()) }) } } func TestManagementCluster_healthCheck_Errors(t *testing.T) { + tests := []struct { name string - machineList *clusterv1.MachineList check healthCheck clusterKey client.ObjectKey controlPlaneName string + controlPlane *ControlPlane // expected errors will ensure the error contains this list of strings. // If not supplied, no check on the error's value will occur. expectedErrors []string }{ { name: "machine's node was not checked for health", - machineList: &clusterv1.MachineList{ - Items: []clusterv1.Machine{ - controlPlaneMachine("one"), - controlPlaneMachine("two"), - controlPlaneMachine("three"), - }, - }, - check: func(ctx context.Context) (HealthCheckResult, error) { + controlPlane: createControlPlane([]*clusterv1.Machine{ + controlPlaneMachine("one"), + controlPlaneMachine("two"), + controlPlaneMachine("three"), + }), + check: func(ctx context.Context, controlPlane *ControlPlane) (HealthCheckResult, error) { return HealthCheckResult{ "one": nil, }, nil @@ -527,14 +534,12 @@ func TestManagementCluster_healthCheck_Errors(t *testing.T) { }, { name: "health check returns an error not related to the nodes health", - machineList: &clusterv1.MachineList{ - Items: []clusterv1.Machine{ - controlPlaneMachine("one"), - controlPlaneMachine("two"), - controlPlaneMachine("three"), - }, - }, - check: func(ctx context.Context) (HealthCheckResult, error) { + controlPlane: createControlPlane([]*clusterv1.Machine{ + controlPlaneMachine("one"), + controlPlaneMachine("two"), + controlPlaneMachine("three"), + }), + check: func(ctx context.Context, controlPlane *ControlPlane) (HealthCheckResult, error) { return HealthCheckResult{ "one": nil, "two": errors.New("two"), @@ -545,14 +550,11 @@ func TestManagementCluster_healthCheck_Errors(t *testing.T) { }, { name: "two nodes error on the check but no overall error occurred", - machineList: &clusterv1.MachineList{ - Items: []clusterv1.Machine{ - controlPlaneMachine("one"), - controlPlaneMachine("two"), - controlPlaneMachine("three"), - }, - }, - check: func(ctx context.Context) (HealthCheckResult, error) { + controlPlane: createControlPlane([]*clusterv1.Machine{ + controlPlaneMachine("one"), + controlPlaneMachine("two"), + controlPlaneMachine("three")}), + check: func(ctx context.Context, controlPlane *ControlPlane) (HealthCheckResult, error) { return HealthCheckResult{ "one": nil, "two": errors.New("two"), @@ -563,12 +565,9 @@ func TestManagementCluster_healthCheck_Errors(t *testing.T) { }, { name: "more nodes than machines were checked (out of band control plane nodes)", - machineList: &clusterv1.MachineList{ - Items: []clusterv1.Machine{ - controlPlaneMachine("one"), - }, - }, - check: func(ctx context.Context) (HealthCheckResult, error) { + controlPlane: createControlPlane([]*clusterv1.Machine{ + controlPlaneMachine("one")}), + check: func(ctx context.Context, controlPlane *ControlPlane) (HealthCheckResult, error) { return HealthCheckResult{ "one": nil, "two": nil, @@ -578,14 +577,11 @@ func TestManagementCluster_healthCheck_Errors(t *testing.T) { }, { name: "a machine that has a nil node reference", - machineList: &clusterv1.MachineList{ - Items: []clusterv1.Machine{ - 
controlPlaneMachine("one"), - controlPlaneMachine("two"), - nilNodeRef(controlPlaneMachine("three")), - }, - }, - check: func(ctx context.Context) (HealthCheckResult, error) { + controlPlane: createControlPlane([]*clusterv1.Machine{ + controlPlaneMachine("one"), + controlPlaneMachine("two"), + nilNodeRef(controlPlaneMachine("three"))}), + check: func(ctx context.Context, controlPlane *ControlPlane) (HealthCheckResult, error) { return HealthCheckResult{ "one": nil, "two": nil, @@ -601,10 +597,8 @@ func TestManagementCluster_healthCheck_Errors(t *testing.T) { ctx := context.Background() clusterKey := client.ObjectKey{Namespace: "default", Name: "cluster-name"} - m := &Management{ - Client: &fakeClient{list: tt.machineList}, - } - err := m.healthCheck(ctx, tt.check, clusterKey) + m := &Management{Client: &fakeClient{}} + err := m.healthCheck(ctx, tt.controlPlane, tt.check, clusterKey) g.Expect(err).To(HaveOccurred()) for _, expectedError := range tt.expectedErrors { @@ -614,9 +608,9 @@ func TestManagementCluster_healthCheck_Errors(t *testing.T) { } } -func controlPlaneMachine(name string) clusterv1.Machine { +func controlPlaneMachine(name string) *clusterv1.Machine { t := true - return clusterv1.Machine{ + return &clusterv1.Machine{ ObjectMeta: metav1.ObjectMeta{ Namespace: "default", Name: name, @@ -637,7 +631,7 @@ func controlPlaneMachine(name string) clusterv1.Machine { } } -func nilNodeRef(machine clusterv1.Machine) clusterv1.Machine { +func nilNodeRef(machine *clusterv1.Machine) *clusterv1.Machine { machine.Status.NodeRef = nil return machine } diff --git a/controlplane/kubeadm/internal/control_plane.go b/controlplane/kubeadm/internal/control_plane.go index 0c2c02dffd56..43f93a315b07 100644 --- a/controlplane/kubeadm/internal/control_plane.go +++ b/controlplane/kubeadm/internal/control_plane.go @@ -38,10 +38,10 @@ import ( // It should never need to connect to a service, that responsibility lies outside of this struct. // Going forward we should be trying to add more logic to here and reduce the amount of logic in the reconciler. 
type ControlPlane struct { - KCP *controlplanev1.KubeadmControlPlane - Cluster *clusterv1.Cluster - Machines FilterableMachineCollection - + KCP *controlplanev1.KubeadmControlPlane + Cluster *clusterv1.Cluster + Machines FilterableMachineCollection + MachineConditions map[string]map[clusterv1.ConditionType]*clusterv1.Condition // reconciliationTime is the time of the current reconciliation, and should be used for all "now" calculations reconciliationTime metav1.Time @@ -57,16 +57,24 @@ func NewControlPlane(ctx context.Context, client client.Client, cluster *cluster if err != nil { return nil, err } + kubeadmConfigs, err := getKubeadmConfigs(ctx, client, ownedMachines) if err != nil { return nil, err } + + machineConditions := make(map[string]map[clusterv1.ConditionType]*clusterv1.Condition) + for machineName := range ownedMachines { + machineConditions[machineName] = make(map[clusterv1.ConditionType]*clusterv1.Condition) + } + return &ControlPlane{ KCP: kcp, Cluster: cluster, Machines: ownedMachines, kubeadmConfigs: kubeadmConfigs, infraResources: infraObjects, + MachineConditions: machineConditions, reconciliationTime: metav1.Now(), }, nil } diff --git a/controlplane/kubeadm/internal/etcd_client_generator.go b/controlplane/kubeadm/internal/etcd_client_generator.go index e709a3bae48d..62eb77a7791b 100644 --- a/controlplane/kubeadm/internal/etcd_client_generator.go +++ b/controlplane/kubeadm/internal/etcd_client_generator.go @@ -19,7 +19,6 @@ package internal import ( "context" "crypto/tls" - "github.com/pkg/errors" kerrors "k8s.io/apimachinery/pkg/util/errors" @@ -39,7 +38,7 @@ type etcdClientGenerator struct { func (c *etcdClientGenerator) forNodes(ctx context.Context, nodes []corev1.Node) (*etcd.Client, error) { endpoints := make([]string, len(nodes)) for i, node := range nodes { - endpoints[i] = staticPodName("etcd", node.Name) + endpoints[i] = staticPodName(EtcdPodNamePrefix, node.Name) } p := proxy.Proxy{ diff --git a/controlplane/kubeadm/internal/machine_collection.go b/controlplane/kubeadm/internal/machine_collection.go index 425cea20344a..975aea467218 100644 --- a/controlplane/kubeadm/internal/machine_collection.go +++ b/controlplane/kubeadm/internal/machine_collection.go @@ -83,8 +83,8 @@ func (s FilterableMachineCollection) SortedByCreationTimestamp() []*clusterv1.Ma return res } -// unsortedList returns the slice with contents in random order. -func (s FilterableMachineCollection) unsortedList() []*clusterv1.Machine { +// UnsortedList returns the slice with contents in random order. +func (s FilterableMachineCollection) UnsortedList() []*clusterv1.Machine { res := make([]*clusterv1.Machine, 0, len(s)) for _, value := range s { res = append(res, value) @@ -110,12 +110,12 @@ func newFilteredMachineCollection(filter machinefilters.Func, machines ...*clust // Filter returns a FilterableMachineCollection containing only the Machines that match all of the given MachineFilters func (s FilterableMachineCollection) Filter(filters ...machinefilters.Func) FilterableMachineCollection { - return newFilteredMachineCollection(machinefilters.And(filters...), s.unsortedList()...) + return newFilteredMachineCollection(machinefilters.And(filters...), s.UnsortedList()...) } // AnyFilter returns a FilterableMachineCollection containing only the Machines that match any of the given MachineFilters func (s FilterableMachineCollection) AnyFilter(filters ...machinefilters.Func) FilterableMachineCollection { - return newFilteredMachineCollection(machinefilters.Or(filters...), s.unsortedList()...) 
+ return newFilteredMachineCollection(machinefilters.Or(filters...), s.UnsortedList()...) } // Oldest returns the Machine with the oldest CreationTimestamp diff --git a/controlplane/kubeadm/internal/workload_cluster.go b/controlplane/kubeadm/internal/workload_cluster.go index d0db1fe684a9..4a642c281c78 100644 --- a/controlplane/kubeadm/internal/workload_cluster.go +++ b/controlplane/kubeadm/internal/workload_cluster.go @@ -26,6 +26,7 @@ import ( "crypto/x509/pkix" "fmt" "math/big" + "sigs.k8s.io/cluster-api/util/conditions" "time" "github.com/blang/semver" @@ -53,12 +54,40 @@ var ( ErrControlPlaneMinNodes = errors.New("cluster has fewer than 2 control plane nodes; removing an etcd member is not supported") ) +// Common control-plane pod name prefixes +const ( + KubeAPIServerPodNamePrefix = "kube-apiserver" + KubeControllerManagerPodNamePrefix = "kube-controller-manager" + KubeSchedulerHealthyPodNamePrefix = "kube-scheduler" + EtcdPodNamePrefix = "etcd" +) + +type ConditionReason string + +// GetSeverity returns the default severity for the condition reasons for the conditions owned by KCP. +func (c ConditionReason) GetSeverity() clusterv1.ConditionSeverity { + switch { + case c == ConditionReason(clusterv1.PodProvisioningReason): + return clusterv1.ConditionSeverityInfo + case c == ConditionReason(clusterv1.PodMissingReason): + return clusterv1.ConditionSeverityWarning + case c == ConditionReason(clusterv1.PodFailedReason): + return clusterv1.ConditionSeverityError + case c == ConditionReason(clusterv1.EtcdMemberUnhealthyReason): + return clusterv1.ConditionSeverityError + case c == ConditionReason(controlplanev1.EtcdClusterUnhealthyReason): + return clusterv1.ConditionSeverityWarning + default: + return clusterv1.ConditionSeverityInfo + } +} + // WorkloadCluster defines all behaviors necessary to upgrade kubernetes on a workload cluster type WorkloadCluster interface { // Basic health and status checks. ClusterStatus(ctx context.Context) (ClusterStatus, error) - ControlPlaneIsHealthy(ctx context.Context) (HealthCheckResult, error) - EtcdIsHealthy(ctx context.Context) (HealthCheckResult, error) + ControlPlaneIsHealthy(ctx context.Context, controlPlane *ControlPlane) (HealthCheckResult, error) + EtcdIsHealthy(ctx context.Context, controlPlane *ControlPlane) (HealthCheckResult, error) // Upgrade related tasks. ReconcileKubeletRBACBinding(ctx context.Context, version semver.Version) error @@ -112,7 +141,7 @@ type HealthCheckResult map[string]error // controlPlaneIsHealthy does a best effort check of the control plane components the kubeadm control plane cares about. // The return map is a map of node names as keys to error that that node encountered. // All nodes will exist in the map with nil errors if there were no errors for that node. -func (w *Workload) ControlPlaneIsHealthy(ctx context.Context) (HealthCheckResult, error) { +func (w *Workload) ControlPlaneIsHealthy(ctx context.Context, controlPlane *ControlPlane) (HealthCheckResult, error) { controlPlaneNodes, err := w.getControlPlaneNodes(ctx) if err != nil { return nil, err @@ -123,37 +152,110 @@ func (w *Workload) ControlPlaneIsHealthy(ctx context.Context) (HealthCheckResult name := node.Name response[name] = nil + var owningMachine *clusterv1.Machine = nil + // If there is no owning Machine for the node, setting conditions are no-op. 
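+		// The pod health checks below still run for such nodes; only the per-Machine condition updates are skipped.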
+ for _, machine := range controlPlane.Machines { + if machine.Status.NodeRef == nil { + continue + } + if machine.Status.NodeRef.Name == node.Name { + owningMachine = machine + } + } + + // If there is any problem with the checks below, set response to any one of the problems. + var anyError error = nil if err := checkNodeNoExecuteCondition(node); err != nil { - response[name] = err - continue + anyError = err } - apiServerPodKey := ctrlclient.ObjectKey{ - Namespace: metav1.NamespaceSystem, - Name: staticPodName("kube-apiserver", name), + // Check kube-api-server health + if _, err = w.checkPodStatusAndUpdateCondition(controlPlane, KubeAPIServerPodNamePrefix, node, owningMachine, clusterv1.MachineAPIServerPodHealthyCondition); err != nil { + anyError = err } - apiServerPod := corev1.Pod{} - if err := w.Client.Get(ctx, apiServerPodKey, &apiServerPod); err != nil { - response[name] = err - continue + + // Check kube-controller-manager health + if _, err = w.checkPodStatusAndUpdateCondition(controlPlane, KubeControllerManagerPodNamePrefix, node, owningMachine, clusterv1.MachineControllerManagerHealthyCondition); err != nil { + anyError = err } - response[name] = checkStaticPodReadyCondition(apiServerPod) - controllerManagerPodKey := ctrlclient.ObjectKey{ - Namespace: metav1.NamespaceSystem, - Name: staticPodName("kube-controller-manager", name), + // Check kube-scheduler health + if _, err = w.checkPodStatusAndUpdateCondition(controlPlane, KubeSchedulerHealthyPodNamePrefix, node, owningMachine, clusterv1.MachineSchedulerPodHealthyCondition); err != nil { + anyError = err } - controllerManagerPod := corev1.Pod{} - if err := w.Client.Get(ctx, controllerManagerPodKey, &controllerManagerPod); err != nil { + + if anyError != nil { response[name] = err continue } - response[name] = checkStaticPodReadyCondition(controllerManagerPod) } return response, nil } +// checkPodStatusAndUpdateConditions returns true, only if the Pod is in Running/Ready condition. +// There is a slight difference between returning false and error: +// checkPodStatusAndUpdateConditions returns false, if Pod is not in Ready Condition or there is a client error. +// checkPodStatusAndUpdateConditions returns error only when it is obvious that the Pod is not in Running/Ready state, so it does not return error on transient client errors. +// For instance, returning (false, nil) means Pod is not ready but is not in error state as well, +// returning (false, err) means Pod is not ready and it is in error state as well. +func (w *Workload) checkPodStatusAndUpdateCondition(controlPlane *ControlPlane, staticPodPrefix string, node corev1.Node, owningMachine *clusterv1.Machine, conditionType clusterv1.ConditionType) (bool, error) { + staticPodKey := ctrlclient.ObjectKey{ + Namespace: metav1.NamespaceSystem, + Name: staticPodName(staticPodPrefix, node.Name), + } + + staticPod := corev1.Pod{} + if err := w.Client.Get(context.TODO(), staticPodKey, &staticPod); err != nil { + // If there is an error getting the Pod, do not set any conditions. + if apierrors.IsNotFound(err) { + if owningMachine != nil { + if _, ok := controlPlane.MachineConditions[owningMachine.Name]; ok { + controlPlane.MachineConditions[owningMachine.Name][conditionType] = conditions.FalseCondition(conditionType, clusterv1.PodMissingReason, ConditionReason(clusterv1.PodMissingReason).GetSeverity(), "") + } + } + return false, errors.Errorf("static pod %s is missing", staticPodPrefix) + } + // Do not return error here, because this may be a transient error. 
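+		// Returning (false, nil) means "not ready, but not a hard failure"; the pod is re-evaluated on the next reconciliation.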
+ return false, nil + } + + if err := checkStaticPodReadyCondition(staticPod); err != nil { + if checkStaticPodFailedPhase(staticPod) { + if owningMachine != nil { + controlPlane.MachineConditions[owningMachine.Name][conditionType] = conditions.FalseCondition(conditionType, clusterv1.PodFailedReason, ConditionReason(clusterv1.PodFailedReason).GetSeverity(), "") + } + return false, errors.Errorf("static pod %s is failed", staticPodPrefix) + } + + // Check if the Pod is in Pending state + if checkStaticPodProvisioning(staticPod) { + if owningMachine != nil { + controlPlane.MachineConditions[owningMachine.Name][conditionType] = conditions.FalseCondition(conditionType, clusterv1.PodProvisioningReason, ConditionReason(clusterv1.PodProvisioningReason).GetSeverity(), "") + } + // This is not an error case. + return false, nil + } + + // If Pod is not provisioning state, check if still in Pending Phase. + podProvisioningState := checkStaticPodAfterProvisioningState(staticPod) + // Non-nil podProvisioningState means there is at least one container in waiting state. + if podProvisioningState != "" { + if owningMachine != nil { + controlPlane.MachineConditions[owningMachine.Name][conditionType] = conditions.FalseCondition(conditionType, clusterv1.PodFailedReason, ConditionReason(clusterv1.PodFailedReason).GetSeverity(), "Pod is provisioned but not ready.") + } + return false, errors.Errorf("static pod %s is provisioned but still is not ready", staticPodPrefix) + } + + return false, errors.Errorf("static pod %s is not ready", staticPodPrefix) + } + + if owningMachine != nil { + controlPlane.MachineConditions[owningMachine.Name][conditionType] = conditions.TrueCondition(conditionType) + } + return true, nil +} + // UpdateKubernetesVersionInKubeadmConfigMap updates the kubernetes version in the kubeadm config map. func (w *Workload) UpdateImageRepositoryInKubeadmConfigMap(ctx context.Context, imageRepository string) error { configMapKey := ctrlclient.ObjectKey{Name: "kubeadm-config", Namespace: metav1.NamespaceSystem} @@ -358,12 +460,53 @@ func checkStaticPodReadyCondition(pod corev1.Pod) error { return errors.Errorf("static pod %s/%s is not ready", pod.Namespace, pod.Name) } } + + // TODO: Remove this as static pods have ready condition. if !found { return errors.Errorf("pod does not have ready condition: %v", pod.Name) } return nil } +func checkStaticPodFailedPhase(pod corev1.Pod) bool { + return pod.Status.Phase == corev1.PodFailed +} + +// If Pod is in Pending phase and PodScheduled or Initialized condition is set to false, then pod is in provisioning state. +func checkStaticPodProvisioning(pod corev1.Pod) bool { + // Pod being not in Pending phase means it has already provisioned and running or failed or in unknown phase. + if pod.Status.Phase != corev1.PodPending { + return false + } + + for _, condition := range pod.Status.Conditions { + if condition.Type == corev1.PodScheduled && condition.Status != corev1.ConditionTrue { + return true + } + if condition.Type == corev1.PodInitialized && condition.Status != corev1.ConditionTrue { + return true + } + } + return false +} + +// If Pod is in Pending phase, there may be some failing containers in the pod. +// This function must be called when PodScheduled and Initialized condition is set to true. If a Pod is initialized but still in Pending state, +// returns non-nil string with the reason of the container in waiting state. 
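+// For example, a pod stuck pulling its image returns "ImagePullBackOff", while an empty string means no
+// container is currently in a waiting state.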
+func checkStaticPodAfterProvisioningState(pod corev1.Pod) string { + // Pod being not in Pending phase means it has already provisioned and running or failed or in unknown phase. + if pod.Status.Phase != corev1.PodPending { + return "" + } + + for _, containerStatus := range pod.Status.ContainerStatuses { + if containerStatus.State.Waiting != nil { + return containerStatus.State.Waiting.Reason + } + } + return "" +} + func checkNodeNoExecuteCondition(node corev1.Node) error { for _, taint := range node.Spec.Taints { if taint.Key == corev1.TaintNodeUnreachable && taint.Effect == corev1.TaintEffectNoExecute { diff --git a/controlplane/kubeadm/internal/workload_cluster_etcd.go b/controlplane/kubeadm/internal/workload_cluster_etcd.go index 78261db27911..0d4f7595d1bb 100644 --- a/controlplane/kubeadm/internal/workload_cluster_etcd.go +++ b/controlplane/kubeadm/internal/workload_cluster_etcd.go @@ -18,14 +18,15 @@ package internal import ( "context" - "github.com/pkg/errors" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" kerrors "k8s.io/apimachinery/pkg/util/errors" clusterv1 "sigs.k8s.io/cluster-api/api/v1alpha3" + controlplanev1 "sigs.k8s.io/cluster-api/controlplane/kubeadm/api/v1alpha3" "sigs.k8s.io/cluster-api/controlplane/kubeadm/internal/etcd" etcdutil "sigs.k8s.io/cluster-api/controlplane/kubeadm/internal/etcd/util" + "sigs.k8s.io/cluster-api/util/conditions" ctrlclient "sigs.k8s.io/controller-runtime/pkg/client" ) @@ -38,7 +39,7 @@ type etcdClientFor interface { // This is a best effort check and nodes can become unhealthy after the check is complete. It is not a guarantee. // It's used a signal for if we should allow a target cluster to scale up, scale down or upgrade. // It returns a map of nodes checked along with an error for a given node. -func (w *Workload) EtcdIsHealthy(ctx context.Context) (HealthCheckResult, error) { +func (w *Workload) EtcdIsHealthy(ctx context.Context, controlPlane *ControlPlane) (HealthCheckResult, error) { var knownClusterID uint64 var knownMemberIDSet etcdutil.UInt64Set @@ -49,6 +50,12 @@ func (w *Workload) EtcdIsHealthy(ctx context.Context) (HealthCheckResult, error) expectedMembers := 0 response := make(map[string]error) + var owningMachine *clusterv1.Machine + owningMachine = nil + + // Initial assumtion is that etcd cluster is healthy. If otherwise is observed below, it is set to false. + conditions.Set(controlPlane.KCP, conditions.TrueCondition(controlplanev1.EtcdClusterHealthy)) + for _, node := range controlPlaneNodes.Items { name := node.Name response[name] = nil @@ -57,19 +64,24 @@ func (w *Workload) EtcdIsHealthy(ctx context.Context) (HealthCheckResult, error) continue } - // Check to see if the pod is ready - etcdPodKey := ctrlclient.ObjectKey{ - Namespace: metav1.NamespaceSystem, - Name: staticPodName("etcd", name), - } - pod := corev1.Pod{} - if err := w.Client.Get(ctx, etcdPodKey, &pod); err != nil { - response[name] = errors.Wrap(err, "failed to get etcd pod") - continue + for _, m := range controlPlane.Machines { + if m.Spec.ProviderID != nil && *m.Spec.ProviderID == node.Spec.ProviderID { + owningMachine = m + // Only set this condition if the node has an owning machine. + controlPlane.MachineConditions[owningMachine.Name][clusterv1.MachineEtcdMemberHealthyCondition] = conditions.TrueCondition(clusterv1.MachineEtcdMemberHealthyCondition) + break + } } - if err := checkStaticPodReadyCondition(pod); err != nil { + + // TODO: If owning machine is nil, should not continue. But this change breaks the logic below. 
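+		// For now a nil owningMachine only skips the per-Machine condition updates; the etcd pod and member checks below still run.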
+ + // Check etcd pod's health + isEtcdPodHealthy, _ := w.checkPodStatusAndUpdateCondition(controlPlane, EtcdPodNamePrefix, node, owningMachine, clusterv1.MachineEtcdPodHealthyCondition) + // isEtcdPodHealthy is false, if Pod is not Ready or there is a client error. + if !isEtcdPodHealthy { // Nothing wrong here, etcd on this node is just not running. // If it's a true failure the healthcheck will fail since it won't have checked enough members. + response[name] = errors.Wrap(err, "etcd pod is not ready") continue } // Only expect a member reports healthy if its pod is ready. @@ -80,6 +92,9 @@ func (w *Workload) EtcdIsHealthy(ctx context.Context) (HealthCheckResult, error) // Create the etcd Client for the etcd Pod scheduled on the Node etcdClient, err := w.etcdClientGenerator.forNodes(ctx, []corev1.Node{node}) if err != nil { + if owningMachine != nil { + controlPlane.MachineConditions[owningMachine.Name][clusterv1.MachineEtcdMemberHealthyCondition] = conditions.FalseCondition(clusterv1.MachineEtcdMemberHealthyCondition, clusterv1.EtcdMemberUnhealthyReason, ConditionReason(clusterv1.EtcdMemberUnhealthyReason).GetSeverity(), "etcd client related failure.") + } response[name] = errors.Wrap(err, "failed to create etcd client") continue } @@ -88,6 +103,9 @@ func (w *Workload) EtcdIsHealthy(ctx context.Context) (HealthCheckResult, error) // List etcd members. This checks that the member is healthy, because the request goes through consensus. members, err := etcdClient.Members(ctx) if err != nil { + if owningMachine != nil { + controlPlane.MachineConditions[owningMachine.Name][clusterv1.MachineEtcdMemberHealthyCondition] = conditions.FalseCondition(clusterv1.MachineEtcdMemberHealthyCondition, clusterv1.EtcdMemberUnhealthyReason, ConditionReason(clusterv1.EtcdMemberUnhealthyReason).GetSeverity(), "etcd client related failure.") + } response[name] = errors.Wrap(err, "failed to list etcd members using etcd client") continue } @@ -96,6 +114,9 @@ func (w *Workload) EtcdIsHealthy(ctx context.Context) (HealthCheckResult, error) // Check that the member reports no alarms. 
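+			// An armed alarm (e.g. NOSPACE) marks the member, and hence the Machine's EtcdMemberHealthy condition, as unhealthy.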
if len(member.Alarms) > 0 { + if owningMachine != nil { + controlPlane.MachineConditions[owningMachine.Name][clusterv1.MachineEtcdMemberHealthyCondition] = conditions.FalseCondition(clusterv1.MachineEtcdMemberHealthyCondition, clusterv1.EtcdMemberUnhealthyReason, ConditionReason(clusterv1.EtcdMemberUnhealthyReason).GetSeverity(), "etcd member has alarms.") + } response[name] = errors.Errorf("etcd member reports alarms: %v", member.Alarms) continue } @@ -116,20 +137,35 @@ func (w *Workload) EtcdIsHealthy(ctx context.Context) (HealthCheckResult, error) } else { unknownMembers := memberIDSet.Difference(knownMemberIDSet) if unknownMembers.Len() > 0 { + conditions.Set(controlPlane.KCP, conditions.FalseCondition(controlplanev1.EtcdClusterHealthy, controlplanev1.EtcdClusterUnhealthyReason, ConditionReason(controlplanev1.EtcdClusterUnhealthyReason).GetSeverity(), "etcd members do not have the same member-list view")) response[name] = errors.Errorf("etcd member reports members IDs %v, but all previously seen etcd members reported member IDs %v", memberIDSet.UnsortedList(), knownMemberIDSet.UnsortedList()) } continue } } + // Check etcd cluster alarms + etcdClient, err := w.etcdClientGenerator.forNodes(ctx, controlPlaneNodes.Items) + if err == nil { + alarmList, err := etcdClient.Alarms(ctx) + if len(alarmList) > 0 || err != nil { + conditions.Set(controlPlane.KCP, conditions.FalseCondition(controlplanev1.EtcdClusterHealthy, controlplanev1.EtcdClusterUnhealthyReason, ConditionReason(controlplanev1.EtcdClusterUnhealthyReason).GetSeverity(), "etcd cluster has alarms.")) + } + } + defer etcdClient.Close() + // TODO: ensure that each pod is owned by a node that we're managing. That would ensure there are no out-of-band etcd members // Check that there is exactly one etcd member for every healthy pod. // This allows us to handle the expected case where there is a failing pod but it's been removed from the member list. 
if expectedMembers != len(knownMemberIDSet) { + conditions.Set(controlPlane.KCP, conditions.FalseCondition(controlplanev1.EtcdClusterHealthy, controlplanev1.EtcdClusterUnhealthyReason, ConditionReason(controlplanev1.EtcdClusterUnhealthyReason).GetSeverity(), "etcd pods does not match with etcd members.")) return response, errors.Errorf("there are %d healthy etcd pods, but %d etcd members", expectedMembers, len(knownMemberIDSet)) } + if len(knownMemberIDSet) < (controlPlane.Machines.Len()/2.0 + 1) { + conditions.Set(controlPlane.KCP, conditions.FalseCondition(controlplanev1.EtcdClusterHealthy, controlplanev1.EtcdClusterUnhealthyReason, ConditionReason(controlplanev1.EtcdClusterUnhealthyReason).GetSeverity(), "etcd cluster's quorum is lost.")) + } return response, nil } diff --git a/controlplane/kubeadm/internal/workload_cluster_etcd_test.go b/controlplane/kubeadm/internal/workload_cluster_etcd_test.go index d4384d4bbec7..e997135494bf 100644 --- a/controlplane/kubeadm/internal/workload_cluster_etcd_test.go +++ b/controlplane/kubeadm/internal/workload_cluster_etcd_test.go @@ -39,6 +39,12 @@ import ( func TestWorkload_EtcdIsHealthy(t *testing.T) { g := NewWithT(t) + controlPlane := createControlPlane([]*clusterv1.Machine{ + controlPlaneMachine("one"), + controlPlaneMachine("two"), + controlPlaneMachine("three"), + }) + workload := &Workload{ Client: &fakeClient{ get: map[string]interface{}{ @@ -75,7 +81,7 @@ func TestWorkload_EtcdIsHealthy(t *testing.T) { }, } ctx := context.Background() - health, err := workload.EtcdIsHealthy(ctx) + health, err := workload.EtcdIsHealthy(ctx, controlPlane) g.Expect(err).NotTo(HaveOccurred()) for _, err := range health { From 3836c76c95a9b9dbd188ae066754bf2c4f48c3f3 Mon Sep 17 00:00:00 2001 From: Sedef Date: Mon, 28 Sep 2020 00:55:03 -0700 Subject: [PATCH 02/12] Set conditions directly on the controlPlane machines. --- .../kubeadm/controllers/controller.go | 56 ++++--------------- controlplane/kubeadm/internal/cluster_test.go | 1 + .../kubeadm/internal/control_plane.go | 13 +---- .../kubeadm/internal/workload_cluster.go | 20 +++---- .../kubeadm/internal/workload_cluster_etcd.go | 11 ++-- 5 files changed, 29 insertions(+), 72 deletions(-) diff --git a/controlplane/kubeadm/controllers/controller.go b/controlplane/kubeadm/controllers/controller.go index 081853892fee..1eb53193484b 100644 --- a/controlplane/kubeadm/controllers/controller.go +++ b/controlplane/kubeadm/controllers/controller.go @@ -212,27 +212,6 @@ func (r *KubeadmControlPlaneReconciler) Reconcile(req ctrl.Request) (res ctrl.Re return r.reconcile(ctx, cluster, kcp) } -// setSingleMachineConditions updates the machine's conditions according to health tracker. -func setSingleMachineConditions(machine *clusterv1.Machine, controlPlane *internal.ControlPlane) { - for condType, condition := range controlPlane.MachineConditions[machine.Name] { - doesConditionExist := false - for _, mCondition := range machine.Status.Conditions { - // If the condition already exists, change the condition. - if mCondition.Type == condType { - conditions.Set(machine, condition) - doesConditionExist = true - } - } - if !doesConditionExist { - if machine.Status.Conditions == nil { - machine.Status.Conditions = clusterv1.Conditions{} - } - conditions.Set(machine, condition) - } - - } -} - func patchKubeadmControlPlane(ctx context.Context, patchHelper *patch.Helper, kcp *controlplanev1.KubeadmControlPlane) error { // Always update the readyCondition by summarizing the state of other conditions. 
 	conditions.SetSummary(kcp,
@@ -327,12 +306,10 @@ func (r *KubeadmControlPlaneReconciler) reconcile(ctx context.Context, cluster *
 	// source ref (reason@machine/name) so the problem can be easily tracked down to its source machine.
 	conditions.SetAggregate(controlPlane.KCP, controlplanev1.MachinesReadyCondition, ownedMachines.ConditionGetters(), conditions.AddSourceRef())
 
-	// If control plane is initialized, reconcile health.
-	if ownedMachines.Len() != 0 {
-		// reconcileControlPlaneHealth returns err if there is a machine being delete
-		if result, err := r.reconcileControlPlaneHealth(ctx, cluster, kcp, controlPlane); err != nil || !result.IsZero() {
-			return result, err
-		}
+	// reconcileControlPlaneHealth returns err if there is a machine being deleted or the control plane is unhealthy.
+	// If the control plane is not initialized, then the control-plane machines will be empty and hence the health check will not fail.
+	if result, err := r.reconcileControlPlaneHealth(ctx, cluster, kcp, controlPlane); err != nil || !result.IsZero() {
+		return result, err
 	}
 
 	// Control plane machines rollout due to configuration changes (e.g. upgrades) takes precedence over other operations.
@@ -474,21 +451,7 @@ func (r *KubeadmControlPlaneReconciler) ClusterToKubeadmControlPlane(o handler.M
 
 func patchControlPlaneMachine(ctx context.Context, patchHelper *patch.Helper, machine *clusterv1.Machine) error {
 	// Patch the object, ignoring conflicts on the conditions owned by this controller.
-
-	// TODO: Is it okay to own these conditions or just patch?
-	// return patchHelper.Patch(ctx, machine)
-
-	return patchHelper.Patch(
-		ctx,
-		machine,
-		patch.WithOwnedConditions{Conditions: []clusterv1.ConditionType{
-			clusterv1.MachineAPIServerPodHealthyCondition,
-			clusterv1.MachineControllerManagerHealthyCondition,
-			clusterv1.MachineEtcdMemberHealthyCondition,
-			clusterv1.MachineEtcdPodHealthyCondition,
-			clusterv1.MachineSchedulerPodHealthyCondition,
-		}},
-	)
+	return patchHelper.Patch(ctx, machine)
 }
 
 // reconcileControlPlaneHealth performs health checks for control plane components and etcd
@@ -497,17 +460,20 @@ func patchControlPlaneMachine(ctx context.Context, patchHelper *patch.Helper, ma
 func (r *KubeadmControlPlaneReconciler) reconcileControlPlaneHealth(ctx context.Context, cluster *clusterv1.Cluster, kcp *controlplanev1.KubeadmControlPlane, controlPlane *internal.ControlPlane) (ctrl.Result, error) {
 	logger := r.Log.WithValues("namespace", kcp.Namespace, "kubeadmControlPlane", kcp.Name)
 
+	// If there are no KCP-owned control-plane machines, then the control plane has not been initialized yet.
+	if controlPlane.Machines.Len() == 0 {
+		return ctrl.Result{}, nil
+	}
+
 	for _, m := range controlPlane.Machines {
 		// Initialize the patch helper.
 		patchHelper, err := patch.NewHelper(m, r.Client)
 		if err != nil {
-			logger.Error(err, "Failed to configure the patch helper")
-			return ctrl.Result{Requeue: true}, nil
+			return ctrl.Result{}, err
 		}
 
 		machine := m
 		defer func() {
-			setSingleMachineConditions(machine, controlPlane)
 			// Always attempt to Patch the Machine conditions after each health reconciliation.
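			// The deferred patch runs when reconcileControlPlaneHealth returns, so conditions set on the
			// Machines by the health checks performed later in this function are persisted even on an early error return.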
if err := patchControlPlaneMachine(ctx, patchHelper, machine); err != nil { logger.Error(err, "Failed to patch KubeadmControlPlane Machine") diff --git a/controlplane/kubeadm/internal/cluster_test.go b/controlplane/kubeadm/internal/cluster_test.go index 24f1845b90fa..f781249a614c 100644 --- a/controlplane/kubeadm/internal/cluster_test.go +++ b/controlplane/kubeadm/internal/cluster_test.go @@ -466,6 +466,7 @@ func createControlPlane(machines []*clusterv1.Machine) *ControlPlane { controlPlane, _ := NewControlPlane(ctx, &fakeClient{}, &clusterv1.Cluster{}, &v1alpha3.KubeadmControlPlane{}, NewFilterableMachineCollection(machines...)) return controlPlane } + func TestManagementCluster_healthCheck_NoError(t *testing.T) { threeMachines := []*clusterv1.Machine{ controlPlaneMachine("one"), diff --git a/controlplane/kubeadm/internal/control_plane.go b/controlplane/kubeadm/internal/control_plane.go index 43f93a315b07..6ce64c0f8f27 100644 --- a/controlplane/kubeadm/internal/control_plane.go +++ b/controlplane/kubeadm/internal/control_plane.go @@ -38,10 +38,9 @@ import ( // It should never need to connect to a service, that responsibility lies outside of this struct. // Going forward we should be trying to add more logic to here and reduce the amount of logic in the reconciler. type ControlPlane struct { - KCP *controlplanev1.KubeadmControlPlane - Cluster *clusterv1.Cluster - Machines FilterableMachineCollection - MachineConditions map[string]map[clusterv1.ConditionType]*clusterv1.Condition + KCP *controlplanev1.KubeadmControlPlane + Cluster *clusterv1.Cluster + Machines FilterableMachineCollection // reconciliationTime is the time of the current reconciliation, and should be used for all "now" calculations reconciliationTime metav1.Time @@ -63,18 +62,12 @@ func NewControlPlane(ctx context.Context, client client.Client, cluster *cluster return nil, err } - machineConditions := make(map[string]map[clusterv1.ConditionType]*clusterv1.Condition) - for machineName := range ownedMachines { - machineConditions[machineName] = make(map[clusterv1.ConditionType]*clusterv1.Condition) - } - return &ControlPlane{ KCP: kcp, Cluster: cluster, Machines: ownedMachines, kubeadmConfigs: kubeadmConfigs, infraResources: infraObjects, - MachineConditions: machineConditions, reconciliationTime: metav1.Now(), }, nil } diff --git a/controlplane/kubeadm/internal/workload_cluster.go b/controlplane/kubeadm/internal/workload_cluster.go index 4a642c281c78..4b8e15a0ab26 100644 --- a/controlplane/kubeadm/internal/workload_cluster.go +++ b/controlplane/kubeadm/internal/workload_cluster.go @@ -170,17 +170,17 @@ func (w *Workload) ControlPlaneIsHealthy(ctx context.Context, controlPlane *Cont } // Check kube-api-server health - if _, err = w.checkPodStatusAndUpdateCondition(controlPlane, KubeAPIServerPodNamePrefix, node, owningMachine, clusterv1.MachineAPIServerPodHealthyCondition); err != nil { + if _, err = w.checkPodStatusAndUpdateCondition(KubeAPIServerPodNamePrefix, node, owningMachine, clusterv1.MachineAPIServerPodHealthyCondition); err != nil { anyError = err } // Check kube-controller-manager health - if _, err = w.checkPodStatusAndUpdateCondition(controlPlane, KubeControllerManagerPodNamePrefix, node, owningMachine, clusterv1.MachineControllerManagerHealthyCondition); err != nil { + if _, err = w.checkPodStatusAndUpdateCondition(KubeControllerManagerPodNamePrefix, node, owningMachine, clusterv1.MachineControllerManagerHealthyCondition); err != nil { anyError = err } // Check kube-scheduler health - if _, err = 
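	// A NotFound error is conclusive: the static pod is genuinely missing and its condition is set to false
	// below; any other client error is treated as transient and the conditions are left unchanged.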
w.checkPodStatusAndUpdateCondition(controlPlane, KubeSchedulerHealthyPodNamePrefix, node, owningMachine, clusterv1.MachineSchedulerPodHealthyCondition); err != nil { + if _, err = w.checkPodStatusAndUpdateCondition(KubeSchedulerHealthyPodNamePrefix, node, owningMachine, clusterv1.MachineSchedulerPodHealthyCondition); err != nil { anyError = err } @@ -199,7 +199,7 @@ func (w *Workload) ControlPlaneIsHealthy(ctx context.Context, controlPlane *Cont // checkPodStatusAndUpdateConditions returns error only when it is obvious that the Pod is not in Running/Ready state, so it does not return error on transient client errors. // For instance, returning (false, nil) means Pod is not ready but is not in error state as well, // returning (false, err) means Pod is not ready and it is in error state as well. -func (w *Workload) checkPodStatusAndUpdateCondition(controlPlane *ControlPlane, staticPodPrefix string, node corev1.Node, owningMachine *clusterv1.Machine, conditionType clusterv1.ConditionType) (bool, error) { +func (w *Workload) checkPodStatusAndUpdateCondition(staticPodPrefix string, node corev1.Node, owningMachine *clusterv1.Machine, conditionType clusterv1.ConditionType) (bool, error) { staticPodKey := ctrlclient.ObjectKey{ Namespace: metav1.NamespaceSystem, Name: staticPodName(staticPodPrefix, node.Name), @@ -210,9 +210,7 @@ func (w *Workload) checkPodStatusAndUpdateCondition(controlPlane *ControlPlane, // If there is an error getting the Pod, do not set any conditions. if apierrors.IsNotFound(err) { if owningMachine != nil { - if _, ok := controlPlane.MachineConditions[owningMachine.Name]; ok { - controlPlane.MachineConditions[owningMachine.Name][conditionType] = conditions.FalseCondition(conditionType, clusterv1.PodMissingReason, ConditionReason(clusterv1.PodMissingReason).GetSeverity(), "") - } + conditions.Set(owningMachine, conditions.FalseCondition(conditionType, clusterv1.PodMissingReason, ConditionReason(clusterv1.PodMissingReason).GetSeverity(), "")) } return false, errors.Errorf("static pod %s is missing", staticPodPrefix) } @@ -223,7 +221,7 @@ func (w *Workload) checkPodStatusAndUpdateCondition(controlPlane *ControlPlane, if err := checkStaticPodReadyCondition(staticPod); err != nil { if checkStaticPodFailedPhase(staticPod) { if owningMachine != nil { - controlPlane.MachineConditions[owningMachine.Name][conditionType] = conditions.FalseCondition(conditionType, clusterv1.PodFailedReason, ConditionReason(clusterv1.PodFailedReason).GetSeverity(), "") + conditions.Set(owningMachine, conditions.FalseCondition(conditionType, clusterv1.PodFailedReason, ConditionReason(clusterv1.PodFailedReason).GetSeverity(), "")) } return false, errors.Errorf("static pod %s is failed", staticPodPrefix) } @@ -231,7 +229,7 @@ func (w *Workload) checkPodStatusAndUpdateCondition(controlPlane *ControlPlane, // Check if the Pod is in Pending state if checkStaticPodProvisioning(staticPod) { if owningMachine != nil { - controlPlane.MachineConditions[owningMachine.Name][conditionType] = conditions.FalseCondition(conditionType, clusterv1.PodProvisioningReason, ConditionReason(clusterv1.PodProvisioningReason).GetSeverity(), "") + conditions.Set(owningMachine, conditions.FalseCondition(conditionType, clusterv1.PodProvisioningReason, ConditionReason(clusterv1.PodProvisioningReason).GetSeverity(), "")) } // This is not an error case. 
return false, nil @@ -242,7 +240,7 @@ func (w *Workload) checkPodStatusAndUpdateCondition(controlPlane *ControlPlane, // Non-nil podProvisioningState means there is at least one container in waiting state. if podProvisioningState != "" { if owningMachine != nil { - controlPlane.MachineConditions[owningMachine.Name][conditionType] = conditions.FalseCondition(conditionType, clusterv1.PodFailedReason, ConditionReason(clusterv1.PodFailedReason).GetSeverity(), "Pod is provisioned but not ready.") + conditions.Set(owningMachine, conditions.FalseCondition(conditionType, clusterv1.PodFailedReason, ConditionReason(clusterv1.PodFailedReason).GetSeverity(), "Pod is provisioned but not ready.")) } return false, errors.Errorf("static pod %s is provisioned but still is not ready", staticPodPrefix) } @@ -251,7 +249,7 @@ func (w *Workload) checkPodStatusAndUpdateCondition(controlPlane *ControlPlane, } if owningMachine != nil { - controlPlane.MachineConditions[owningMachine.Name][conditionType] = conditions.TrueCondition(conditionType) + conditions.Set(owningMachine, conditions.TrueCondition(conditionType)) } return true, nil } diff --git a/controlplane/kubeadm/internal/workload_cluster_etcd.go b/controlplane/kubeadm/internal/workload_cluster_etcd.go index 0d4f7595d1bb..9b437b95a3f0 100644 --- a/controlplane/kubeadm/internal/workload_cluster_etcd.go +++ b/controlplane/kubeadm/internal/workload_cluster_etcd.go @@ -51,7 +51,6 @@ func (w *Workload) EtcdIsHealthy(ctx context.Context, controlPlane *ControlPlane expectedMembers := 0 response := make(map[string]error) var owningMachine *clusterv1.Machine - owningMachine = nil // Initial assumtion is that etcd cluster is healthy. If otherwise is observed below, it is set to false. conditions.Set(controlPlane.KCP, conditions.TrueCondition(controlplanev1.EtcdClusterHealthy)) @@ -68,7 +67,7 @@ func (w *Workload) EtcdIsHealthy(ctx context.Context, controlPlane *ControlPlane if m.Spec.ProviderID != nil && *m.Spec.ProviderID == node.Spec.ProviderID { owningMachine = m // Only set this condition if the node has an owning machine. - controlPlane.MachineConditions[owningMachine.Name][clusterv1.MachineEtcdMemberHealthyCondition] = conditions.TrueCondition(clusterv1.MachineEtcdMemberHealthyCondition) + conditions.Set(owningMachine, conditions.TrueCondition(clusterv1.MachineEtcdMemberHealthyCondition)) break } } @@ -76,7 +75,7 @@ func (w *Workload) EtcdIsHealthy(ctx context.Context, controlPlane *ControlPlane // TODO: If owning machine is nil, should not continue. But this change breaks the logic below. // Check etcd pod's health - isEtcdPodHealthy, _ := w.checkPodStatusAndUpdateCondition(controlPlane, EtcdPodNamePrefix, node, owningMachine, clusterv1.MachineEtcdPodHealthyCondition) + isEtcdPodHealthy, _ := w.checkPodStatusAndUpdateCondition(EtcdPodNamePrefix, node, owningMachine, clusterv1.MachineEtcdPodHealthyCondition) // isEtcdPodHealthy is false, if Pod is not Ready or there is a client error. if !isEtcdPodHealthy { // Nothing wrong here, etcd on this node is just not running. 
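A minimal sketch of the ProviderID matching used above to map a Node back to the Machine that owns it; the helper name below is illustrative only and is not part of this series, and it assumes the KCP-owned machine collection is already at hand:

func findOwningMachine(node corev1.Node, machines FilterableMachineCollection) *clusterv1.Machine {
	// Illustrative helper, not introduced by this patch series.
	// A Machine owns a Node when their provider IDs match; Spec.ProviderID may still be nil while
	// the infrastructure is provisioning, so guard the dereference.
	for _, m := range machines {
		if m.Spec.ProviderID != nil && *m.Spec.ProviderID == node.Spec.ProviderID {
			return m
		}
	}
	return nil
}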
@@ -93,7 +92,7 @@ func (w *Workload) EtcdIsHealthy(ctx context.Context, controlPlane *ControlPlane etcdClient, err := w.etcdClientGenerator.forNodes(ctx, []corev1.Node{node}) if err != nil { if owningMachine != nil { - controlPlane.MachineConditions[owningMachine.Name][clusterv1.MachineEtcdMemberHealthyCondition] = conditions.FalseCondition(clusterv1.MachineEtcdMemberHealthyCondition, clusterv1.EtcdMemberUnhealthyReason, ConditionReason(clusterv1.EtcdMemberUnhealthyReason).GetSeverity(), "etcd client related failure.") + conditions.Set(owningMachine, conditions.FalseCondition(clusterv1.MachineEtcdMemberHealthyCondition, clusterv1.EtcdMemberUnhealthyReason, ConditionReason(clusterv1.EtcdMemberUnhealthyReason).GetSeverity(), "etcd client related failure.")) } response[name] = errors.Wrap(err, "failed to create etcd client") continue @@ -104,7 +103,7 @@ func (w *Workload) EtcdIsHealthy(ctx context.Context, controlPlane *ControlPlane members, err := etcdClient.Members(ctx) if err != nil { if owningMachine != nil { - controlPlane.MachineConditions[owningMachine.Name][clusterv1.MachineEtcdMemberHealthyCondition] = conditions.FalseCondition(clusterv1.MachineEtcdMemberHealthyCondition, clusterv1.EtcdMemberUnhealthyReason, ConditionReason(clusterv1.EtcdMemberUnhealthyReason).GetSeverity(), "etcd client related failure.") + conditions.Set(owningMachine, conditions.FalseCondition(clusterv1.MachineEtcdMemberHealthyCondition, clusterv1.EtcdMemberUnhealthyReason, ConditionReason(clusterv1.EtcdMemberUnhealthyReason).GetSeverity(), "etcd client related failure.")) } response[name] = errors.Wrap(err, "failed to list etcd members using etcd client") continue @@ -115,7 +114,7 @@ func (w *Workload) EtcdIsHealthy(ctx context.Context, controlPlane *ControlPlane // Check that the member reports no alarms. if len(member.Alarms) > 0 { if owningMachine != nil { - controlPlane.MachineConditions[owningMachine.Name][clusterv1.MachineEtcdMemberHealthyCondition] = conditions.FalseCondition(clusterv1.MachineEtcdMemberHealthyCondition, clusterv1.EtcdMemberUnhealthyReason, ConditionReason(clusterv1.EtcdMemberUnhealthyReason).GetSeverity(), "etcd member has alarms.") + conditions.Set(owningMachine, conditions.FalseCondition(clusterv1.MachineEtcdMemberHealthyCondition, clusterv1.EtcdMemberUnhealthyReason, ConditionReason(clusterv1.EtcdMemberUnhealthyReason).GetSeverity(), "etcd member has alarms.")) } response[name] = errors.Errorf("etcd member reports alarms: %v", member.Alarms) continue From 849c5d336a98a6d0dafe47024c6f0ddb9c73700c Mon Sep 17 00:00:00 2001 From: Sedef Date: Mon, 28 Sep 2020 09:46:45 -0700 Subject: [PATCH 03/12] Address comments --- controlplane/kubeadm/controllers/controller.go | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/controlplane/kubeadm/controllers/controller.go b/controlplane/kubeadm/controllers/controller.go index 1eb53193484b..c04c2a4de2a2 100644 --- a/controlplane/kubeadm/controllers/controller.go +++ b/controlplane/kubeadm/controllers/controller.go @@ -449,11 +449,6 @@ func (r *KubeadmControlPlaneReconciler) ClusterToKubeadmControlPlane(o handler.M return nil } -func patchControlPlaneMachine(ctx context.Context, patchHelper *patch.Helper, machine *clusterv1.Machine) error { - // Patch the object, ignoring conflicts on the conditions owned by this controller. - return patchHelper.Patch(ctx, machine) -} - // reconcileControlPlaneHealth performs health checks for control plane components and etcd // It removes any etcd members that do not have a corresponding node. 
// Also, as a final step, checks whether any machines are being deleted.
@@ -465,17 +460,17 @@ func (r *KubeadmControlPlaneReconciler) reconcileControlPlaneHealth(ctx context.
 		return ctrl.Result{}, nil
 	}
 
-	for _, m := range controlPlane.Machines {
+	for i := range controlPlane.Machines {
+		m := controlPlane.Machines[i]
 		// Initialize the patch helper.
 		patchHelper, err := patch.NewHelper(m, r.Client)
 		if err != nil {
 			return ctrl.Result{}, err
 		}
 
-		machine := m
 		defer func() {
 			// Always attempt to Patch the Machine conditions after each health reconciliation.
-			if err := patchControlPlaneMachine(ctx, patchHelper, machine); err != nil {
+			if err := patchHelper.Patch(ctx, m); err != nil {
 				logger.Error(err, "Failed to patch KubeadmControlPlane Machine")
 			}
 		}()

From f70db9ed9aa6c24a5025833ad29cc3d3751b6ff7 Mon Sep 17 00:00:00 2001
From: Sedef 
Date: Mon, 28 Sep 2020 14:55:19 -0700
Subject: [PATCH 04/12] Minor changes.

---
 .../kubeadm/controllers/controller.go         |  2 +-
 .../kubeadm/internal/workload_cluster.go      | 10 ++++----
 .../kubeadm/internal/workload_cluster_etcd.go | 23 +++++++++++--------
 3 files changed, 19 insertions(+), 16 deletions(-)

diff --git a/controlplane/kubeadm/controllers/controller.go b/controlplane/kubeadm/controllers/controller.go
index c04c2a4de2a2..7c2e6dfa136f 100644
--- a/controlplane/kubeadm/controllers/controller.go
+++ b/controlplane/kubeadm/controllers/controller.go
@@ -471,7 +471,7 @@ func (r *KubeadmControlPlaneReconciler) reconcileControlPlaneHealth(ctx context.
 		defer func() {
 			// Always attempt to Patch the Machine conditions after each health reconciliation.
 			if err := patchHelper.Patch(ctx, m); err != nil {
-				logger.Error(err, "Failed to patch KubeadmControlPlane Machine")
+				logger.Error(err, "Failed to patch KubeadmControlPlane Machine", "machine", m.Name)
 			}
 		}()
 	}
diff --git a/controlplane/kubeadm/internal/workload_cluster.go b/controlplane/kubeadm/internal/workload_cluster.go
index 4b8e15a0ab26..11192606cd5e 100644
--- a/controlplane/kubeadm/internal/workload_cluster.go
+++ b/controlplane/kubeadm/internal/workload_cluster.go
@@ -210,7 +210,7 @@ func (w *Workload) checkPodStatusAndUpdateCondition(staticPodPrefix string, node
 	// If there is an error getting the Pod, do not set any conditions.
if apierrors.IsNotFound(err) { if owningMachine != nil { - conditions.Set(owningMachine, conditions.FalseCondition(conditionType, clusterv1.PodMissingReason, ConditionReason(clusterv1.PodMissingReason).GetSeverity(), "")) + conditions.MarkFalse(owningMachine, conditionType, clusterv1.PodMissingReason, ConditionReason(clusterv1.PodMissingReason).GetSeverity(), "") } return false, errors.Errorf("static pod %s is missing", staticPodPrefix) } @@ -221,7 +221,7 @@ func (w *Workload) checkPodStatusAndUpdateCondition(staticPodPrefix string, node if err := checkStaticPodReadyCondition(staticPod); err != nil { if checkStaticPodFailedPhase(staticPod) { if owningMachine != nil { - conditions.Set(owningMachine, conditions.FalseCondition(conditionType, clusterv1.PodFailedReason, ConditionReason(clusterv1.PodFailedReason).GetSeverity(), "")) + conditions.MarkFalse(owningMachine, conditionType, clusterv1.PodFailedReason, ConditionReason(clusterv1.PodFailedReason).GetSeverity(), "") } return false, errors.Errorf("static pod %s is failed", staticPodPrefix) } @@ -229,7 +229,7 @@ func (w *Workload) checkPodStatusAndUpdateCondition(staticPodPrefix string, node // Check if the Pod is in Pending state if checkStaticPodProvisioning(staticPod) { if owningMachine != nil { - conditions.Set(owningMachine, conditions.FalseCondition(conditionType, clusterv1.PodProvisioningReason, ConditionReason(clusterv1.PodProvisioningReason).GetSeverity(), "")) + conditions.MarkFalse(owningMachine, conditionType, clusterv1.PodProvisioningReason, ConditionReason(clusterv1.PodProvisioningReason).GetSeverity(), "") } // This is not an error case. return false, nil @@ -240,7 +240,7 @@ func (w *Workload) checkPodStatusAndUpdateCondition(staticPodPrefix string, node // Non-nil podProvisioningState means there is at least one container in waiting state. if podProvisioningState != "" { if owningMachine != nil { - conditions.Set(owningMachine, conditions.FalseCondition(conditionType, clusterv1.PodFailedReason, ConditionReason(clusterv1.PodFailedReason).GetSeverity(), "Pod is provisioned but not ready.")) + conditions.MarkFalse(owningMachine, conditionType, clusterv1.PodFailedReason, ConditionReason(clusterv1.PodFailedReason).GetSeverity(), "Pod is provisioned but not ready.") } return false, errors.Errorf("static pod %s is provisioned but still is not ready", staticPodPrefix) } @@ -249,7 +249,7 @@ func (w *Workload) checkPodStatusAndUpdateCondition(staticPodPrefix string, node } if owningMachine != nil { - conditions.Set(owningMachine, conditions.TrueCondition(conditionType)) + conditions.MarkTrue(owningMachine, conditionType) } return true, nil } diff --git a/controlplane/kubeadm/internal/workload_cluster_etcd.go b/controlplane/kubeadm/internal/workload_cluster_etcd.go index 9b437b95a3f0..6f6f51a26adf 100644 --- a/controlplane/kubeadm/internal/workload_cluster_etcd.go +++ b/controlplane/kubeadm/internal/workload_cluster_etcd.go @@ -53,7 +53,7 @@ func (w *Workload) EtcdIsHealthy(ctx context.Context, controlPlane *ControlPlane var owningMachine *clusterv1.Machine // Initial assumtion is that etcd cluster is healthy. If otherwise is observed below, it is set to false. 
- conditions.Set(controlPlane.KCP, conditions.TrueCondition(controlplanev1.EtcdClusterHealthy)) + conditions.MarkTrue(controlPlane.KCP, controlplanev1.EtcdClusterHealthy) for _, node := range controlPlaneNodes.Items { name := node.Name @@ -67,7 +67,7 @@ func (w *Workload) EtcdIsHealthy(ctx context.Context, controlPlane *ControlPlane if m.Spec.ProviderID != nil && *m.Spec.ProviderID == node.Spec.ProviderID { owningMachine = m // Only set this condition if the node has an owning machine. - conditions.Set(owningMachine, conditions.TrueCondition(clusterv1.MachineEtcdMemberHealthyCondition)) + conditions.MarkTrue(owningMachine, clusterv1.MachineEtcdMemberHealthyCondition) break } } @@ -92,7 +92,7 @@ func (w *Workload) EtcdIsHealthy(ctx context.Context, controlPlane *ControlPlane etcdClient, err := w.etcdClientGenerator.forNodes(ctx, []corev1.Node{node}) if err != nil { if owningMachine != nil { - conditions.Set(owningMachine, conditions.FalseCondition(clusterv1.MachineEtcdMemberHealthyCondition, clusterv1.EtcdMemberUnhealthyReason, ConditionReason(clusterv1.EtcdMemberUnhealthyReason).GetSeverity(), "etcd client related failure.")) + conditions.MarkFalse(owningMachine, clusterv1.MachineEtcdMemberHealthyCondition, clusterv1.EtcdMemberUnhealthyReason, ConditionReason(clusterv1.EtcdMemberUnhealthyReason).GetSeverity(), "etcd client related failure.") } response[name] = errors.Wrap(err, "failed to create etcd client") continue @@ -103,7 +103,7 @@ func (w *Workload) EtcdIsHealthy(ctx context.Context, controlPlane *ControlPlane members, err := etcdClient.Members(ctx) if err != nil { if owningMachine != nil { - conditions.Set(owningMachine, conditions.FalseCondition(clusterv1.MachineEtcdMemberHealthyCondition, clusterv1.EtcdMemberUnhealthyReason, ConditionReason(clusterv1.EtcdMemberUnhealthyReason).GetSeverity(), "etcd client related failure.")) + conditions.MarkFalse(owningMachine, clusterv1.MachineEtcdMemberHealthyCondition, clusterv1.EtcdMemberUnhealthyReason, ConditionReason(clusterv1.EtcdMemberUnhealthyReason).GetSeverity(), "etcd client related failure.") } response[name] = errors.Wrap(err, "failed to list etcd members using etcd client") continue @@ -114,7 +114,7 @@ func (w *Workload) EtcdIsHealthy(ctx context.Context, controlPlane *ControlPlane // Check that the member reports no alarms. 
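		// (etcd alarms include NOSPACE and CORRUPT; a member reporting any armed alarm is treated as unhealthy below.)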
if len(member.Alarms) > 0 { if owningMachine != nil { - conditions.Set(owningMachine, conditions.FalseCondition(clusterv1.MachineEtcdMemberHealthyCondition, clusterv1.EtcdMemberUnhealthyReason, ConditionReason(clusterv1.EtcdMemberUnhealthyReason).GetSeverity(), "etcd member has alarms.")) + conditions.MarkFalse(owningMachine, clusterv1.MachineEtcdMemberHealthyCondition, clusterv1.EtcdMemberUnhealthyReason, ConditionReason(clusterv1.EtcdMemberUnhealthyReason).GetSeverity(), "etcd member has alarms.") } response[name] = errors.Errorf("etcd member reports alarms: %v", member.Alarms) continue @@ -136,7 +136,7 @@ func (w *Workload) EtcdIsHealthy(ctx context.Context, controlPlane *ControlPlane } else { unknownMembers := memberIDSet.Difference(knownMemberIDSet) if unknownMembers.Len() > 0 { - conditions.Set(controlPlane.KCP, conditions.FalseCondition(controlplanev1.EtcdClusterHealthy, controlplanev1.EtcdClusterUnhealthyReason, ConditionReason(controlplanev1.EtcdClusterUnhealthyReason).GetSeverity(), "etcd members do not have the same member-list view")) + conditions.MarkFalse(controlPlane.KCP, controlplanev1.EtcdClusterHealthy, controlplanev1.EtcdClusterUnhealthyReason, ConditionReason(controlplanev1.EtcdClusterUnhealthyReason).GetSeverity(), "etcd members do not have the same member-list view") response[name] = errors.Errorf("etcd member reports members IDs %v, but all previously seen etcd members reported member IDs %v", memberIDSet.UnsortedList(), knownMemberIDSet.UnsortedList()) } continue @@ -146,24 +146,27 @@ func (w *Workload) EtcdIsHealthy(ctx context.Context, controlPlane *ControlPlane // Check etcd cluster alarms etcdClient, err := w.etcdClientGenerator.forNodes(ctx, controlPlaneNodes.Items) if err == nil { + defer etcdClient.Close() alarmList, err := etcdClient.Alarms(ctx) if len(alarmList) > 0 || err != nil { - conditions.Set(controlPlane.KCP, conditions.FalseCondition(controlplanev1.EtcdClusterHealthy, controlplanev1.EtcdClusterUnhealthyReason, ConditionReason(controlplanev1.EtcdClusterUnhealthyReason).GetSeverity(), "etcd cluster has alarms.")) + conditions.MarkFalse(controlPlane.KCP, controlplanev1.EtcdClusterHealthy, controlplanev1.EtcdClusterUnhealthyReason, ConditionReason(controlplanev1.EtcdClusterUnhealthyReason).GetSeverity(), "etcd cluster has alarms.") + return response, errors.Errorf("etcd cluster has %d alarms", len(alarmList)) } } - defer etcdClient.Close() // TODO: ensure that each pod is owned by a node that we're managing. That would ensure there are no out-of-band etcd members // Check that there is exactly one etcd member for every healthy pod. // This allows us to handle the expected case where there is a failing pod but it's been removed from the member list. 
if expectedMembers != len(knownMemberIDSet) { - conditions.Set(controlPlane.KCP, conditions.FalseCondition(controlplanev1.EtcdClusterHealthy, controlplanev1.EtcdClusterUnhealthyReason, ConditionReason(controlplanev1.EtcdClusterUnhealthyReason).GetSeverity(), "etcd pods does not match with etcd members.")) + conditions.MarkFalse(controlPlane.KCP, controlplanev1.EtcdClusterHealthy, controlplanev1.EtcdClusterUnhealthyReason, ConditionReason(controlplanev1.EtcdClusterUnhealthyReason).GetSeverity(), "etcd pods does not match with etcd members.") return response, errors.Errorf("there are %d healthy etcd pods, but %d etcd members", expectedMembers, len(knownMemberIDSet)) } + // TODO: During provisioning, this condition may be set false for a short time until all pods are provisioned, can add additional checks here to prevent this. if len(knownMemberIDSet) < (controlPlane.Machines.Len()/2.0 + 1) { - conditions.Set(controlPlane.KCP, conditions.FalseCondition(controlplanev1.EtcdClusterHealthy, controlplanev1.EtcdClusterUnhealthyReason, ConditionReason(controlplanev1.EtcdClusterUnhealthyReason).GetSeverity(), "etcd cluster's quorum is lost.")) + conditions.MarkFalse(controlPlane.KCP, controlplanev1.EtcdClusterHealthy, controlplanev1.EtcdClusterUnhealthyReason, ConditionReason(controlplanev1.EtcdClusterUnhealthyReason).GetSeverity(), "etcd cluster's quorum is lost.") + return response, errors.Errorf("etcd lost its quorum: there are %d control-plane machines, but %d etcd members", controlPlane.Machines.Len(), len(knownMemberIDSet)) } return response, nil } From efba2231fe1dca809aef43316f19eb0f31298c92 Mon Sep 17 00:00:00 2001 From: Sedef Date: Tue, 29 Sep 2020 09:41:15 -0700 Subject: [PATCH 05/12] Address comments --- .../kubeadm/internal/workload_cluster.go | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/controlplane/kubeadm/internal/workload_cluster.go b/controlplane/kubeadm/internal/workload_cluster.go index 11192606cd5e..ded2caa961c2 100644 --- a/controlplane/kubeadm/internal/workload_cluster.go +++ b/controlplane/kubeadm/internal/workload_cluster.go @@ -227,7 +227,7 @@ func (w *Workload) checkPodStatusAndUpdateCondition(staticPodPrefix string, node } // Check if the Pod is in Pending state - if checkStaticPodProvisioning(staticPod) { + if isStaticPodProvisioning(staticPod) { if owningMachine != nil { conditions.MarkFalse(owningMachine, conditionType, clusterv1.PodProvisioningReason, ConditionReason(clusterv1.PodProvisioningReason).GetSeverity(), "") } @@ -240,7 +240,7 @@ func (w *Workload) checkPodStatusAndUpdateCondition(staticPodPrefix string, node // Non-nil podProvisioningState means there is at least one container in waiting state. if podProvisioningState != "" { if owningMachine != nil { - conditions.MarkFalse(owningMachine, conditionType, clusterv1.PodFailedReason, ConditionReason(clusterv1.PodFailedReason).GetSeverity(), "Pod is provisioned but not ready.") + conditions.MarkFalse(owningMachine, conditionType, clusterv1.PodFailedReason, ConditionReason(clusterv1.PodFailedReason).GetSeverity(), "Pod is provisioned but not ready: %s", podProvisioningState) } return false, errors.Errorf("static pod %s is provisioned but still is not ready", staticPodPrefix) } @@ -471,18 +471,18 @@ func checkStaticPodFailedPhase(pod corev1.Pod) bool { } // If Pod is in Pending phase and PodScheduled or Initialized condition is set to false, then pod is in provisioning state. 
-func checkStaticPodProvisioning(pod corev1.Pod) bool { +func isStaticPodProvisioning(pod corev1.Pod) bool { // Pod being not in Pending phase means it has already provisioned and running or failed or in unknown phase. if pod.Status.Phase != corev1.PodPending { return false } for _, condition := range pod.Status.Conditions { - if condition.Type == corev1.PodScheduled && condition.Status != corev1.ConditionTrue { - return true - } - if condition.Type == corev1.PodInitialized && condition.Status != corev1.ConditionTrue { - return true + if condition.Status != corev1.ConditionTrue { + switch condition.Type { + case corev1.PodScheduled, corev1.PodInitialized: + return true + } } } return false From bb8c4b5fe9c9ddc1c268ab0d449ebb03a6a7d212 Mon Sep 17 00:00:00 2001 From: Sedef Date: Tue, 29 Sep 2020 22:08:08 -0700 Subject: [PATCH 06/12] Modify reconcilePodStatusCondition return type --- controlplane/kubeadm/internal/cluster_test.go | 4 +- .../kubeadm/internal/workload_cluster.go | 131 +++++++++--------- .../kubeadm/internal/workload_cluster_etcd.go | 6 +- 3 files changed, 73 insertions(+), 68 deletions(-) diff --git a/controlplane/kubeadm/internal/cluster_test.go b/controlplane/kubeadm/internal/cluster_test.go index f781249a614c..f4193244ac5f 100644 --- a/controlplane/kubeadm/internal/cluster_test.go +++ b/controlplane/kubeadm/internal/cluster_test.go @@ -64,7 +64,7 @@ func TestCheckStaticPodReadyCondition(t *testing.T) { Spec: corev1.PodSpec{}, Status: corev1.PodStatus{Conditions: test.conditions}, } - g.Expect(checkStaticPodReadyCondition(pod)).To(Succeed()) + g.Expect(isStaticPodInReadyCondition(pod)).To(Succeed()) }) } } @@ -90,7 +90,7 @@ func TestCheckStaticPodNotReadyCondition(t *testing.T) { Spec: corev1.PodSpec{}, Status: corev1.PodStatus{Conditions: test.conditions}, } - g.Expect(checkStaticPodReadyCondition(pod)).NotTo(Succeed()) + g.Expect(isStaticPodInReadyCondition(pod)).NotTo(Succeed()) }) } } diff --git a/controlplane/kubeadm/internal/workload_cluster.go b/controlplane/kubeadm/internal/workload_cluster.go index ded2caa961c2..811dfb5515b1 100644 --- a/controlplane/kubeadm/internal/workload_cluster.go +++ b/controlplane/kubeadm/internal/workload_cluster.go @@ -26,7 +26,6 @@ import ( "crypto/x509/pkix" "fmt" "math/big" - "sigs.k8s.io/cluster-api/util/conditions" "time" "github.com/blang/semver" @@ -35,10 +34,12 @@ import ( corev1 "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + kerrors "k8s.io/apimachinery/pkg/util/errors" clusterv1 "sigs.k8s.io/cluster-api/api/v1alpha3" controlplanev1 "sigs.k8s.io/cluster-api/controlplane/kubeadm/api/v1alpha3" "sigs.k8s.io/cluster-api/util" "sigs.k8s.io/cluster-api/util/certs" + "sigs.k8s.io/cluster-api/util/conditions" containerutil "sigs.k8s.io/cluster-api/util/container" "sigs.k8s.io/cluster-api/util/patch" ctrlclient "sigs.k8s.io/controller-runtime/pkg/client" @@ -66,16 +67,16 @@ type ConditionReason string // GetSeverity returns the default severity for the condition reasons for the conditions owned by KCP. 
func (c ConditionReason) GetSeverity() clusterv1.ConditionSeverity { - switch { - case c == ConditionReason(clusterv1.PodProvisioningReason): + switch c { + case clusterv1.PodProvisioningReason: return clusterv1.ConditionSeverityInfo - case c == ConditionReason(clusterv1.PodMissingReason): + case clusterv1.PodMissingReason: return clusterv1.ConditionSeverityWarning - case c == ConditionReason(clusterv1.PodFailedReason): + case clusterv1.PodFailedReason: return clusterv1.ConditionSeverityError - case c == ConditionReason(clusterv1.EtcdMemberUnhealthyReason): + case clusterv1.EtcdMemberUnhealthyReason: return clusterv1.ConditionSeverityError - case c == ConditionReason(controlplanev1.EtcdClusterUnhealthyReason): + case controlplanev1.EtcdClusterUnhealthyReason: return clusterv1.ConditionSeverityWarning default: return clusterv1.ConditionSeverityInfo @@ -164,28 +165,29 @@ func (w *Workload) ControlPlaneIsHealthy(ctx context.Context, controlPlane *Cont } // If there is any problem with the checks below, set response to any one of the problems. - var anyError error = nil - if err := checkNodeNoExecuteCondition(node); err != nil { - anyError = err + errList := []error{} + + if err := checkNodeNoExecuteTaint(node); err != nil { + errList = append(errList, err) } // Check kube-api-server health - if _, err = w.checkPodStatusAndUpdateCondition(KubeAPIServerPodNamePrefix, node, owningMachine, clusterv1.MachineAPIServerPodHealthyCondition); err != nil { - anyError = err + if _, err = w.reconcilePodStatusCondition(KubeAPIServerPodNamePrefix, node, owningMachine, clusterv1.MachineAPIServerPodHealthyCondition); err != nil { + errList = append(errList, err) } // Check kube-controller-manager health - if _, err = w.checkPodStatusAndUpdateCondition(KubeControllerManagerPodNamePrefix, node, owningMachine, clusterv1.MachineControllerManagerHealthyCondition); err != nil { - anyError = err + if _, err = w.reconcilePodStatusCondition(KubeControllerManagerPodNamePrefix, node, owningMachine, clusterv1.MachineControllerManagerHealthyCondition); err != nil { + errList = append(errList, err) } // Check kube-scheduler health - if _, err = w.checkPodStatusAndUpdateCondition(KubeSchedulerHealthyPodNamePrefix, node, owningMachine, clusterv1.MachineSchedulerPodHealthyCondition); err != nil { - anyError = err + if _, err = w.reconcilePodStatusCondition(KubeSchedulerHealthyPodNamePrefix, node, owningMachine, clusterv1.MachineSchedulerPodHealthyCondition); err != nil { + errList = append(errList, err) } - if anyError != nil { - response[name] = err + if len(errList) > 0 { + response[name] = kerrors.NewAggregate(errList) continue } } @@ -193,13 +195,14 @@ func (w *Workload) ControlPlaneIsHealthy(ctx context.Context, controlPlane *Cont return response, nil } -// checkPodStatusAndUpdateConditions returns true, only if the Pod is in Running/Ready condition. -// There is a slight difference between returning false and error: -// checkPodStatusAndUpdateConditions returns false, if Pod is not in Ready Condition or there is a client error. -// checkPodStatusAndUpdateConditions returns error only when it is obvious that the Pod is not in Running/Ready state, so it does not return error on transient client errors. -// For instance, returning (false, nil) means Pod is not ready but is not in error state as well, -// returning (false, err) means Pod is not ready and it is in error state as well. 
-func (w *Workload) checkPodStatusAndUpdateCondition(staticPodPrefix string, node corev1.Node, owningMachine *clusterv1.Machine, conditionType clusterv1.ConditionType) (bool, error) { +type podState struct { + provisioning bool + unknown bool + ready bool +} + +// reconcilePodStatusCondition returns error if all podState states are false, i.e., Node is reachable, provisioned, and not in Ready state. +func (w *Workload) reconcilePodStatusCondition(staticPodPrefix string, node corev1.Node, owningMachine *clusterv1.Machine, conditionType clusterv1.ConditionType) (podState, error) { staticPodKey := ctrlclient.ObjectKey{ Namespace: metav1.NamespaceSystem, Name: staticPodName(staticPodPrefix, node.Name), @@ -212,46 +215,48 @@ func (w *Workload) checkPodStatusAndUpdateCondition(staticPodPrefix string, node if owningMachine != nil { conditions.MarkFalse(owningMachine, conditionType, clusterv1.PodMissingReason, ConditionReason(clusterv1.PodMissingReason).GetSeverity(), "") } - return false, errors.Errorf("static pod %s is missing", staticPodPrefix) + return podState{}, errors.Errorf("static pod %s is missing", staticPodPrefix) } // Do not return error here, because this may be a transient error. - return false, nil + return podState{unknown: true}, nil } - if err := checkStaticPodReadyCondition(staticPod); err != nil { - if checkStaticPodFailedPhase(staticPod) { - if owningMachine != nil { - conditions.MarkFalse(owningMachine, conditionType, clusterv1.PodFailedReason, ConditionReason(clusterv1.PodFailedReason).GetSeverity(), "") - } - return false, errors.Errorf("static pod %s is failed", staticPodPrefix) + // If Pod is Ready, no need to check the rest of the states below. + if err := isStaticPodInReadyCondition(staticPod); err == nil { + if owningMachine != nil { + conditions.MarkTrue(owningMachine, conditionType) } + return podState{ready: true}, nil + } - // Check if the Pod is in Pending state - if isStaticPodProvisioning(staticPod) { - if owningMachine != nil { - conditions.MarkFalse(owningMachine, conditionType, clusterv1.PodProvisioningReason, ConditionReason(clusterv1.PodProvisioningReason).GetSeverity(), "") - } - // This is not an error case. - return false, nil - } + // If Pod is not in Ready state, it can be in Provisioning or Failed state. - // If Pod is not provisioning state, check if still in Pending Phase. - podProvisioningState := checkStaticPodAfterProvisioningState(staticPod) - // Non-nil podProvisioningState means there is at least one container in waiting state. - if podProvisioningState != "" { - if owningMachine != nil { - conditions.MarkFalse(owningMachine, conditionType, clusterv1.PodFailedReason, ConditionReason(clusterv1.PodFailedReason).GetSeverity(), "Pod is provisioned but not ready: %s", podProvisioningState) - } - return false, errors.Errorf("static pod %s is provisioned but still is not ready", staticPodPrefix) + if isStaticPodInFailedPhase(staticPod) { + if owningMachine != nil { + conditions.MarkFalse(owningMachine, conditionType, clusterv1.PodFailedReason, ConditionReason(clusterv1.PodFailedReason).GetSeverity(), "") } + return podState{}, errors.Errorf("static pod %s is failed", staticPodPrefix) + } - return false, errors.Errorf("static pod %s is not ready", staticPodPrefix) + // Check if the Pod is in Pending state due to provisioning. 
+ if isStaticPodProvisioning(staticPod) { + if owningMachine != nil { + conditions.MarkFalse(owningMachine, conditionType, clusterv1.PodProvisioningReason, ConditionReason(clusterv1.PodProvisioningReason).GetSeverity(), "") + } + // This is not an error case. + return podState{provisioning: true}, nil } - if owningMachine != nil { - conditions.MarkTrue(owningMachine, conditionType) + // Pod is provisioned, but if it is still in Pending Phase, surface the errors in Containers. + // Non-nil error means there is at least one container in waiting state. + if err := checkStaticPodAfterProvisioningState(staticPod); err != nil { + if owningMachine != nil { + conditions.MarkFalse(owningMachine, conditionType, clusterv1.PodFailedReason, ConditionReason(clusterv1.PodFailedReason).GetSeverity(), "Pod is provisioned but not ready: %s", err.Error()) + } + return podState{}, errors.Errorf("static pod %s is provisioned but still is not ready", staticPodPrefix) } - return true, nil + + return podState{}, errors.Errorf("static pod %s is not ready", staticPodPrefix) } // UpdateKubernetesVersionInKubeadmConfigMap updates the kubernetes version in the kubeadm config map. @@ -448,14 +453,14 @@ func staticPodName(component, nodeName string) string { return fmt.Sprintf("%s-%s", component, nodeName) } -func checkStaticPodReadyCondition(pod corev1.Pod) error { +func isStaticPodInReadyCondition(pod corev1.Pod) error { found := false for _, condition := range pod.Status.Conditions { if condition.Type == corev1.PodReady { found = true - } - if condition.Type == corev1.PodReady && condition.Status != corev1.ConditionTrue { - return errors.Errorf("static pod %s/%s is not ready", pod.Namespace, pod.Name) + if condition.Status != corev1.ConditionTrue { + return errors.Errorf("static pod %s/%s is not ready", pod.Namespace, pod.Name) + } } } @@ -466,7 +471,7 @@ func checkStaticPodReadyCondition(pod corev1.Pod) error { return nil } -func checkStaticPodFailedPhase(pod corev1.Pod) bool { +func isStaticPodInFailedPhase(pod corev1.Pod) bool { return pod.Status.Phase == corev1.PodFailed } @@ -491,21 +496,21 @@ func isStaticPodProvisioning(pod corev1.Pod) bool { // If Pod is in Pending phase, there may be some failing containers in the pod. // This function must be called when PodScheduled and Initialized condition is set to true. If a Pod is initialized but still in Pending state, // returns non-nil string with the reason of the container in waiting state. -func checkStaticPodAfterProvisioningState(pod corev1.Pod) string { +func checkStaticPodAfterProvisioningState(pod corev1.Pod) error { // Pod being not in Pending phase means it has already provisioned and running or failed or in unknown phase. 
 	if pod.Status.Phase != corev1.PodPending {
-		return ""
+		return nil
 	}
 
 	for _, containerStatus := range pod.Status.ContainerStatuses {
 		if containerStatus.State.Waiting != nil {
-			return containerStatus.State.Waiting.Reason
+			return errors.New(containerStatus.State.Waiting.Reason)
 		}
 	}
 
-	return ""
+	return nil
 }
 
-func checkNodeNoExecuteCondition(node corev1.Node) error {
+func checkNodeNoExecuteTaint(node corev1.Node) error {
 	for _, taint := range node.Spec.Taints {
 		if taint.Key == corev1.TaintNodeUnreachable && taint.Effect == corev1.TaintEffectNoExecute {
 			return errors.Errorf("node has NoExecute taint: %v", node.Name)
diff --git a/controlplane/kubeadm/internal/workload_cluster_etcd.go b/controlplane/kubeadm/internal/workload_cluster_etcd.go
index 6f6f51a26adf..876db0221450 100644
--- a/controlplane/kubeadm/internal/workload_cluster_etcd.go
+++ b/controlplane/kubeadm/internal/workload_cluster_etcd.go
@@ -75,9 +75,9 @@ func (w *Workload) EtcdIsHealthy(ctx context.Context, controlPlane *ControlPlane
 		// TODO: If owning machine is nil, should not continue. But this change breaks the logic below.
 
 		// Check etcd pod's health
-		isEtcdPodHealthy, _ := w.checkPodStatusAndUpdateCondition(EtcdPodNamePrefix, node, owningMachine, clusterv1.MachineEtcdPodHealthyCondition)
-		// isEtcdPodHealthy is false, if Pod is not Ready or there is a client error.
-		if !isEtcdPodHealthy {
+		etcdPodState, _ := w.reconcilePodStatusCondition(EtcdPodNamePrefix, node, owningMachine, clusterv1.MachineEtcdPodHealthyCondition)
+		// etcdPodState.ready is false if the Pod is not Ready or there was a client error.
+		if !etcdPodState.ready {
 			// Nothing wrong here, etcd on this node is just not running.
 			// If it's a true failure the healthcheck will fail since it won't have checked enough members.
 			response[name] = errors.Wrap(err, "etcd pod is not ready")

From e2d6f83e52da4eda35fb186f3ae0aa2703fa6d06 Mon Sep 17 00:00:00 2001
From: Sedef 
Date: Thu, 1 Oct 2020 23:01:02 -0700
Subject: [PATCH 07/12] Removed GetSeverity()

---
 api/v1alpha3/condition_consts.go              | 49 -------------------
 .../kubeadm/api/v1alpha3/condition_consts.go  | 49 +++++++++++++++++++
 .../kubeadm/internal/workload_cluster.go      | 34 +++----------
 .../kubeadm/internal/workload_cluster_etcd.go | 18 +++----
 4 files changed, 65 insertions(+), 85 deletions(-)

diff --git a/api/v1alpha3/condition_consts.go b/api/v1alpha3/condition_consts.go
index 9b85e76aff9b..5e2e59f815a6 100644
--- a/api/v1alpha3/condition_consts.go
+++ b/api/v1alpha3/condition_consts.go
@@ -128,52 +128,3 @@ const (
 	// WaitingForRemediationReason is the reason used when a machine fails a health check and remediation is needed.
 	WaitingForRemediationReason = "WaitingForRemediation"
 )
-
-// Common Pod-related Condition Reasons used by Pod-related Conditions such as MachineAPIServerPodHealthyCondition etc.
-const (
-	// PodProvisioningReason (Severity=Info) documents a pod waiting to be provisioned i.e., Pod is in "Pending" phase and
-	// PodScheduled and Initialized conditions are not yet set to True.
-	PodProvisioningReason = "PodProvisioning"
-
-	// PodMissingReason (Severity=Warning) documents a pod does not exist.
- PodMissingReason = "PodMissing" - - // PodFailedReason (Severity=Error) documents if - // i) a pod failed during provisioning i.e., Pod is in "Pending" phase and - // PodScheduled and Initialized conditions are set to True but ContainersReady or Ready condition is false - // (i.e., at least one of the containers are in waiting state(e.g CrashLoopbackOff, ImagePullBackOff) - // ii) a pod has at least one container that is terminated with a failure and hence Pod is in "Failed" phase. - PodFailedReason = "PodFailed" -) - -// Conditions that are only for control-plane machines. KubeadmControlPlane is the owner of these conditions. - -const ( - // MachineAPIServerPodHealthyCondition reports a machine's kube-apiserver's health status. - // Set to true if kube-apiserver pod is in "Running" phase, otherwise uses Pod-related Condition Reasons. - MachineAPIServerPodHealthyCondition ConditionType = "APIServerPodHealthy" - - // MachineControllerManagerHealthyCondition reports a machine's kube-controller-manager's health status. - // Set to true if kube-controller-manager pod is in "Running" phase, otherwise uses Pod-related Condition Reasons. - MachineControllerManagerHealthyCondition ConditionType = "ControllerManagerPodHealthy" - - // MachineSchedulerPodHealthyCondition reports a machine's kube-scheduler's health status. - // Set to true if kube-scheduler pod is in "Running" phase, otherwise uses Pod-related Condition Reasons. - MachineSchedulerPodHealthyCondition ConditionType = "SchedulerPodHealthy" - - // MachineEtcdPodHealthyCondition reports a machine's etcd pod's health status. - // Set to true if etcd pod is in "Running" phase, otherwise uses Pod-related Condition Reasons. - MachineEtcdPodHealthyCondition ConditionType = "EtcdPodHealthy" -) - -const ( - // MachineEtcdMemberHealthyCondition documents if the machine has an healthy etcd member. - // If not true, Pod-related Condition Reasons can be used as reasons. - MachineEtcdMemberHealthyCondition ConditionType = "EtcdMemberHealthy" - - // EtcdMemberUnhealthyReason (Severity=Error) documents a Machine's etcd member is unhealthy for a number of reasons: - // i) etcd member has alarms. - // ii) creating etcd client fails or using the created etcd client to perform some operations fails. - // iii) Quorum is lost - EtcdMemberUnhealthyReason = "EtcdMemberUnhealthy" -) diff --git a/controlplane/kubeadm/api/v1alpha3/condition_consts.go b/controlplane/kubeadm/api/v1alpha3/condition_consts.go index ea7ece99ec5c..2db9020a1a3a 100644 --- a/controlplane/kubeadm/api/v1alpha3/condition_consts.go +++ b/controlplane/kubeadm/api/v1alpha3/condition_consts.go @@ -77,3 +77,52 @@ const ( // iii) if etcd pods do not match with etcd members. EtcdClusterUnhealthyReason = "EtcdClusterUnhealthy" ) + +// Common Pod-related Condition Reasons used by Pod-related Conditions such as MachineAPIServerPodHealthyCondition etc. +const ( + // PodProvisioningReason (Severity=Info) documents a pod waiting to be provisioned i.e., Pod is in "Pending" phase and + // PodScheduled and Initialized conditions are not yet set to True. + PodProvisioningReason = "PodProvisioning" + + // PodMissingReason (Severity=Warning) documents a pod does not exist. 
+	PodMissingReason = "PodMissing"
+
+	// PodFailedReason (Severity=Error) documents if
+	// i) a pod failed during provisioning i.e., Pod is in "Pending" phase and
+	// PodScheduled and Initialized conditions are set to True but ContainersReady or Ready condition is false
+	// (i.e., at least one of the containers is in a waiting state, e.g. CrashLoopBackOff, ImagePullBackOff)
+	// ii) a pod has at least one container that is terminated with a failure and hence Pod is in "Failed" phase.
+	PodFailedReason = "PodFailed"
+)
+
+// Conditions that are only for control-plane machines. KubeadmControlPlane is the owner of these conditions.
+
+const (
+	// MachineAPIServerPodHealthyCondition reports a machine's kube-apiserver's health status.
+	// Set to true if kube-apiserver pod is in "Running" phase, otherwise uses Pod-related Condition Reasons.
+	MachineAPIServerPodHealthyCondition clusterv1.ConditionType = "APIServerPodHealthy"
+
+	// MachineControllerManagerHealthyCondition reports a machine's kube-controller-manager's health status.
+	// Set to true if kube-controller-manager pod is in "Running" phase, otherwise uses Pod-related Condition Reasons.
+	MachineControllerManagerHealthyCondition clusterv1.ConditionType = "ControllerManagerPodHealthy"
+
+	// MachineSchedulerPodHealthyCondition reports a machine's kube-scheduler's health status.
+	// Set to true if kube-scheduler pod is in "Running" phase, otherwise uses Pod-related Condition Reasons.
+	MachineSchedulerPodHealthyCondition clusterv1.ConditionType = "SchedulerPodHealthy"
+
+	// MachineEtcdPodHealthyCondition reports a machine's etcd pod's health status.
+	// Set to true if etcd pod is in "Running" phase, otherwise uses Pod-related Condition Reasons.
+	MachineEtcdPodHealthyCondition clusterv1.ConditionType = "EtcdPodHealthy"
+)
+
+const (
+	// MachineEtcdMemberHealthyCondition documents if the machine has a healthy etcd member.
+	// If not true, Pod-related Condition Reasons can be used as reasons.
+	MachineEtcdMemberHealthyCondition clusterv1.ConditionType = "EtcdMemberHealthy"
+
+	// EtcdMemberUnhealthyReason (Severity=Error) documents that a Machine's etcd member is unhealthy for a number of reasons:
+	// i) etcd member has alarms.
+	// ii) creating etcd client fails or using the created etcd client to perform some operations fails.
+	// iii) quorum is lost.
+	EtcdMemberUnhealthyReason = "EtcdMemberUnhealthy"
+)
diff --git a/controlplane/kubeadm/internal/workload_cluster.go b/controlplane/kubeadm/internal/workload_cluster.go
index 811dfb5515b1..b9545cd591cf 100644
--- a/controlplane/kubeadm/internal/workload_cluster.go
+++ b/controlplane/kubeadm/internal/workload_cluster.go
@@ -63,26 +63,6 @@ const (
 	EtcdPodNamePrefix = "etcd"
 )
 
-type ConditionReason string
-
-// GetSeverity returns the default severity for the condition reasons for the conditions owned by KCP.
-func (c ConditionReason) GetSeverity() clusterv1.ConditionSeverity { - switch c { - case clusterv1.PodProvisioningReason: - return clusterv1.ConditionSeverityInfo - case clusterv1.PodMissingReason: - return clusterv1.ConditionSeverityWarning - case clusterv1.PodFailedReason: - return clusterv1.ConditionSeverityError - case clusterv1.EtcdMemberUnhealthyReason: - return clusterv1.ConditionSeverityError - case controlplanev1.EtcdClusterUnhealthyReason: - return clusterv1.ConditionSeverityWarning - default: - return clusterv1.ConditionSeverityInfo - } -} - // WorkloadCluster defines all behaviors necessary to upgrade kubernetes on a workload cluster type WorkloadCluster interface { // Basic health and status checks. @@ -172,17 +152,17 @@ func (w *Workload) ControlPlaneIsHealthy(ctx context.Context, controlPlane *Cont } // Check kube-api-server health - if _, err = w.reconcilePodStatusCondition(KubeAPIServerPodNamePrefix, node, owningMachine, clusterv1.MachineAPIServerPodHealthyCondition); err != nil { + if _, err = w.reconcilePodStatusCondition(KubeAPIServerPodNamePrefix, node, owningMachine, controlplanev1.MachineAPIServerPodHealthyCondition); err != nil { errList = append(errList, err) } // Check kube-controller-manager health - if _, err = w.reconcilePodStatusCondition(KubeControllerManagerPodNamePrefix, node, owningMachine, clusterv1.MachineControllerManagerHealthyCondition); err != nil { + if _, err = w.reconcilePodStatusCondition(KubeControllerManagerPodNamePrefix, node, owningMachine, controlplanev1.MachineControllerManagerHealthyCondition); err != nil { errList = append(errList, err) } // Check kube-scheduler health - if _, err = w.reconcilePodStatusCondition(KubeSchedulerHealthyPodNamePrefix, node, owningMachine, clusterv1.MachineSchedulerPodHealthyCondition); err != nil { + if _, err = w.reconcilePodStatusCondition(KubeSchedulerHealthyPodNamePrefix, node, owningMachine, controlplanev1.MachineSchedulerPodHealthyCondition); err != nil { errList = append(errList, err) } @@ -213,7 +193,7 @@ func (w *Workload) reconcilePodStatusCondition(staticPodPrefix string, node core // If there is an error getting the Pod, do not set any conditions. if apierrors.IsNotFound(err) { if owningMachine != nil { - conditions.MarkFalse(owningMachine, conditionType, clusterv1.PodMissingReason, ConditionReason(clusterv1.PodMissingReason).GetSeverity(), "") + conditions.MarkFalse(owningMachine, conditionType, controlplanev1.PodMissingReason, clusterv1.ConditionSeverityWarning, "") } return podState{}, errors.Errorf("static pod %s is missing", staticPodPrefix) } @@ -233,7 +213,7 @@ func (w *Workload) reconcilePodStatusCondition(staticPodPrefix string, node core if isStaticPodInFailedPhase(staticPod) { if owningMachine != nil { - conditions.MarkFalse(owningMachine, conditionType, clusterv1.PodFailedReason, ConditionReason(clusterv1.PodFailedReason).GetSeverity(), "") + conditions.MarkFalse(owningMachine, conditionType, controlplanev1.PodFailedReason, clusterv1.ConditionSeverityError, "") } return podState{}, errors.Errorf("static pod %s is failed", staticPodPrefix) } @@ -241,7 +221,7 @@ func (w *Workload) reconcilePodStatusCondition(staticPodPrefix string, node core // Check if the Pod is in Pending state due to provisioning. 
 	if isStaticPodProvisioning(staticPod) {
 		if owningMachine != nil {
-			conditions.MarkFalse(owningMachine, conditionType, clusterv1.PodProvisioningReason, ConditionReason(clusterv1.PodProvisioningReason).GetSeverity(), "")
+			conditions.MarkFalse(owningMachine, conditionType, controlplanev1.PodProvisioningReason, clusterv1.ConditionSeverityInfo, "")
 		}
 		// This is not an error case.
 		return podState{provisioning: true}, nil
@@ -251,7 +231,7 @@ func (w *Workload) reconcilePodStatusCondition(staticPodPrefix string, node core
 	// Pod is provisioned, but if it is still in Pending Phase, surface the errors in Containers.
 	// Non-nil error means there is at least one container in waiting state.
 	if err := checkStaticPodAfterProvisioningState(staticPod); err != nil {
 		if owningMachine != nil {
-			conditions.MarkFalse(owningMachine, conditionType, clusterv1.PodFailedReason, ConditionReason(clusterv1.PodFailedReason).GetSeverity(), "Pod is provisioned but not ready: %s", err.Error())
+			conditions.MarkFalse(owningMachine, conditionType, controlplanev1.PodFailedReason, clusterv1.ConditionSeverityError, "Pod is provisioned but not ready: %s", err.Error())
 		}
 		return podState{}, errors.Errorf("static pod %s is provisioned but still is not ready", staticPodPrefix)
 	}
diff --git a/controlplane/kubeadm/internal/workload_cluster_etcd.go b/controlplane/kubeadm/internal/workload_cluster_etcd.go
index 876db0221450..11e153cbef3d 100644
--- a/controlplane/kubeadm/internal/workload_cluster_etcd.go
+++ b/controlplane/kubeadm/internal/workload_cluster_etcd.go
@@ -67,7 +67,7 @@ func (w *Workload) EtcdIsHealthy(ctx context.Context, controlPlane *ControlPlane
 			if m.Spec.ProviderID != nil && *m.Spec.ProviderID == node.Spec.ProviderID {
 				owningMachine = m
 				// Only set this condition if the node has an owning machine.
-				conditions.MarkTrue(owningMachine, clusterv1.MachineEtcdMemberHealthyCondition)
+				conditions.MarkTrue(owningMachine, controlplanev1.MachineEtcdMemberHealthyCondition)
 				break
 			}
 		}
@@ -75,7 +75,7 @@ func (w *Workload) EtcdIsHealthy(ctx context.Context, controlPlane *ControlPlane
 		// TODO: If owning machine is nil, should not continue. But this change breaks the logic below.
 
 		// Check etcd pod's health
-		etcdPodState, _ := w.reconcilePodStatusCondition(EtcdPodNamePrefix, node, owningMachine, clusterv1.MachineEtcdPodHealthyCondition)
+		etcdPodState, _ := w.reconcilePodStatusCondition(EtcdPodNamePrefix, node, owningMachine, controlplanev1.MachineEtcdPodHealthyCondition)
 		// etcdPodState.ready is false if the Pod is not Ready or there was a client error.
 		if !etcdPodState.ready {
 			// Nothing wrong here, etcd on this node is just not running.
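A minimal sketch of how a caller can branch on the podState returned by reconcilePodStatusCondition; the snippet is illustrative only and assumes a *Workload w, a corev1.Node node, and an owningMachine resolved as in the loop above:

// Illustrative usage, not introduced by this patch series.
state, err := w.reconcilePodStatusCondition(EtcdPodNamePrefix, node, owningMachine, controlplanev1.MachineEtcdPodHealthyCondition)
switch {
case state.ready:
	// The etcd static pod is Ready; continue with the per-member etcd checks.
case state.provisioning:
	// The pod is still coming up; expected while the control plane scales, not an error.
case state.unknown:
	// A transient client error; no condition was changed, retry on the next reconcile.
default:
	// The pod is missing, failed, or stuck in Pending; err carries the details.
	_ = err
}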
@@ -92,7 +92,7 @@ func (w *Workload) EtcdIsHealthy(ctx context.Context, controlPlane *ControlPlane etcdClient, err := w.etcdClientGenerator.forNodes(ctx, []corev1.Node{node}) if err != nil { if owningMachine != nil { - conditions.MarkFalse(owningMachine, clusterv1.MachineEtcdMemberHealthyCondition, clusterv1.EtcdMemberUnhealthyReason, ConditionReason(clusterv1.EtcdMemberUnhealthyReason).GetSeverity(), "etcd client related failure.") + conditions.MarkFalse(owningMachine, controlplanev1.MachineEtcdMemberHealthyCondition, controlplanev1.EtcdMemberUnhealthyReason, clusterv1.ConditionSeverityError, "etcd client related failure.") } response[name] = errors.Wrap(err, "failed to create etcd client") continue @@ -103,7 +103,7 @@ func (w *Workload) EtcdIsHealthy(ctx context.Context, controlPlane *ControlPlane members, err := etcdClient.Members(ctx) if err != nil { if owningMachine != nil { - conditions.MarkFalse(owningMachine, clusterv1.MachineEtcdMemberHealthyCondition, clusterv1.EtcdMemberUnhealthyReason, ConditionReason(clusterv1.EtcdMemberUnhealthyReason).GetSeverity(), "etcd client related failure.") + conditions.MarkFalse(owningMachine, controlplanev1.MachineEtcdMemberHealthyCondition, controlplanev1.EtcdMemberUnhealthyReason, clusterv1.ConditionSeverityError, "etcd client related failure.") } response[name] = errors.Wrap(err, "failed to list etcd members using etcd client") continue @@ -114,7 +114,7 @@ func (w *Workload) EtcdIsHealthy(ctx context.Context, controlPlane *ControlPlane // Check that the member reports no alarms. if len(member.Alarms) > 0 { if owningMachine != nil { - conditions.MarkFalse(owningMachine, clusterv1.MachineEtcdMemberHealthyCondition, clusterv1.EtcdMemberUnhealthyReason, ConditionReason(clusterv1.EtcdMemberUnhealthyReason).GetSeverity(), "etcd member has alarms.") + conditions.MarkFalse(owningMachine, controlplanev1.MachineEtcdMemberHealthyCondition, controlplanev1.EtcdMemberUnhealthyReason, clusterv1.ConditionSeverityError, "etcd member has alarms.") } response[name] = errors.Errorf("etcd member reports alarms: %v", member.Alarms) continue @@ -136,7 +136,7 @@ func (w *Workload) EtcdIsHealthy(ctx context.Context, controlPlane *ControlPlane } else { unknownMembers := memberIDSet.Difference(knownMemberIDSet) if unknownMembers.Len() > 0 { - conditions.MarkFalse(controlPlane.KCP, controlplanev1.EtcdClusterHealthy, controlplanev1.EtcdClusterUnhealthyReason, ConditionReason(controlplanev1.EtcdClusterUnhealthyReason).GetSeverity(), "etcd members do not have the same member-list view") + conditions.MarkFalse(controlPlane.KCP, controlplanev1.EtcdClusterHealthy, controlplanev1.EtcdClusterUnhealthyReason, clusterv1.ConditionSeverityWarning, "etcd members do not have the same member-list view") response[name] = errors.Errorf("etcd member reports members IDs %v, but all previously seen etcd members reported member IDs %v", memberIDSet.UnsortedList(), knownMemberIDSet.UnsortedList()) } continue @@ -149,7 +149,7 @@ func (w *Workload) EtcdIsHealthy(ctx context.Context, controlPlane *ControlPlane defer etcdClient.Close() alarmList, err := etcdClient.Alarms(ctx) if len(alarmList) > 0 || err != nil { - conditions.MarkFalse(controlPlane.KCP, controlplanev1.EtcdClusterHealthy, controlplanev1.EtcdClusterUnhealthyReason, ConditionReason(controlplanev1.EtcdClusterUnhealthyReason).GetSeverity(), "etcd cluster has alarms.") + conditions.MarkFalse(controlPlane.KCP, controlplanev1.EtcdClusterHealthy, controlplanev1.EtcdClusterUnhealthyReason, clusterv1.ConditionSeverityWarning, "etcd cluster has 
alarms.") return response, errors.Errorf("etcd cluster has %d alarms", len(alarmList)) } } @@ -159,13 +159,13 @@ func (w *Workload) EtcdIsHealthy(ctx context.Context, controlPlane *ControlPlane // Check that there is exactly one etcd member for every healthy pod. // This allows us to handle the expected case where there is a failing pod but it's been removed from the member list. if expectedMembers != len(knownMemberIDSet) { - conditions.MarkFalse(controlPlane.KCP, controlplanev1.EtcdClusterHealthy, controlplanev1.EtcdClusterUnhealthyReason, ConditionReason(controlplanev1.EtcdClusterUnhealthyReason).GetSeverity(), "etcd pods does not match with etcd members.") + conditions.MarkFalse(controlPlane.KCP, controlplanev1.EtcdClusterHealthy, controlplanev1.EtcdClusterUnhealthyReason, clusterv1.ConditionSeverityWarning, "etcd pods does not match with etcd members.") return response, errors.Errorf("there are %d healthy etcd pods, but %d etcd members", expectedMembers, len(knownMemberIDSet)) } // TODO: During provisioning, this condition may be set false for a short time until all pods are provisioned, can add additional checks here to prevent this. if len(knownMemberIDSet) < (controlPlane.Machines.Len()/2.0 + 1) { - conditions.MarkFalse(controlPlane.KCP, controlplanev1.EtcdClusterHealthy, controlplanev1.EtcdClusterUnhealthyReason, ConditionReason(controlplanev1.EtcdClusterUnhealthyReason).GetSeverity(), "etcd cluster's quorum is lost.") + conditions.MarkFalse(controlPlane.KCP, controlplanev1.EtcdClusterHealthy, controlplanev1.EtcdClusterUnhealthyReason, clusterv1.ConditionSeverityWarning, "etcd cluster's quorum is lost.") return response, errors.Errorf("etcd lost its quorum: there are %d control-plane machines, but %d etcd members", controlPlane.Machines.Len(), len(knownMemberIDSet)) } return response, nil From f984dba42469febabee9c48e8752e0f3f9e1eb96 Mon Sep 17 00:00:00 2001 From: Sedef Date: Fri, 9 Oct 2020 03:23:02 -0700 Subject: [PATCH 08/12] Add health checks to reconcileDelete --- .../kubeadm/controllers/controller.go | 52 ++++++++++--- .../kubeadm/controllers/fakes_test.go | 13 ++++ controlplane/kubeadm/internal/cluster.go | 36 ++++++--- controlplane/kubeadm/internal/cluster_test.go | 78 ++++++------------- 4 files changed, 104 insertions(+), 75 deletions(-) diff --git a/controlplane/kubeadm/controllers/controller.go b/controlplane/kubeadm/controllers/controller.go index 7c2e6dfa136f..099baa3ac952 100644 --- a/controlplane/kubeadm/controllers/controller.go +++ b/controlplane/kubeadm/controllers/controller.go @@ -290,21 +290,19 @@ func (r *KubeadmControlPlaneReconciler) reconcile(ctx context.Context, cluster * return ctrl.Result{}, err } - ownedMachines := controlPlaneMachines.Filter(machinefilters.OwnedMachines(kcp)) - if len(ownedMachines) != len(controlPlaneMachines) { - logger.Info("Not all control plane machines are owned by this KubeadmControlPlane, refusing to operate in mixed management mode") - return ctrl.Result{}, nil - } - - controlPlane, err := internal.NewControlPlane(ctx, r.Client, cluster, kcp, ownedMachines) - if err != nil { + controlPlane, err := r.createControlPlane(ctx, cluster, kcp) + if controlPlane == nil || err != nil { logger.Error(err, "failed to initialize control plane") return ctrl.Result{}, err } + if len(controlPlane.Machines) != len(controlPlaneMachines) { + logger.Info("Not all control plane machines are owned by this KubeadmControlPlane, refusing to operate in mixed management mode") + return ctrl.Result{}, nil + } // Aggregate the operational state of all 
the machines; while aggregating we are adding the // source ref (reason@machine/name) so the problem can be easily tracked down to its source machine. - conditions.SetAggregate(controlPlane.KCP, controlplanev1.MachinesReadyCondition, ownedMachines.ConditionGetters(), conditions.AddSourceRef()) + conditions.SetAggregate(controlPlane.KCP, controlplanev1.MachinesReadyCondition, controlPlane.Machines.ConditionGetters(), conditions.AddSourceRef()) // reconcileControlPlaneHealth returns err if there is a machine being delete or control plane is unhealthy. // If control plane is not initialized, then control-plane machines will be empty and hence health check will not fail. @@ -331,7 +329,7 @@ func (r *KubeadmControlPlaneReconciler) reconcile(ctx context.Context, cluster * } // If we've made it this far, we can assume that all ownedMachines are up to date - numMachines := len(ownedMachines) + numMachines := len(controlPlane.Machines) desiredReplicas := int(*kcp.Spec.Replicas) switch { @@ -379,6 +377,24 @@ func (r *KubeadmControlPlaneReconciler) reconcile(ctx context.Context, cluster * return ctrl.Result{}, nil } +func (r *KubeadmControlPlaneReconciler) createControlPlane(ctx context.Context, cluster *clusterv1.Cluster, kcp *controlplanev1.KubeadmControlPlane) (*internal.ControlPlane, error) { + logger := r.Log.WithValues("namespace", kcp.Namespace, "kubeadmControlPlane", kcp.Name, "cluster", cluster.Name) + + controlPlaneMachines, err := r.managementCluster.GetMachinesForCluster(ctx, util.ObjectKey(cluster), machinefilters.ControlPlaneMachines(cluster.Name)) + if err != nil { + logger.Error(err, "failed to retrieve control plane machines for cluster") + return nil, err + } + ownedMachines := controlPlaneMachines.Filter(machinefilters.OwnedMachines(kcp)) + + controlPlane, err := internal.NewControlPlane(ctx, r.Client, cluster, kcp, ownedMachines) + if err != nil { + logger.Error(err, "failed to initialize control plane") + return nil, err + } + return controlPlane, nil +} + // reconcileDelete handles KubeadmControlPlane deletion. // The implementation does not take non-control plane workloads into consideration. This may or may not change in the future. // Please see https://github.com/kubernetes-sigs/cluster-api/issues/2064. @@ -386,6 +402,22 @@ func (r *KubeadmControlPlaneReconciler) reconcileDelete(ctx context.Context, clu logger := r.Log.WithValues("namespace", kcp.Namespace, "kubeadmControlPlane", kcp.Name, "cluster", cluster.Name) logger.Info("Reconcile KubeadmControlPlane deletion") + controlPlane, err := r.createControlPlane(ctx, cluster, kcp) + if controlPlane == nil || err != nil { + logger.Error(err, "failed to initialize control plane") + return ctrl.Result{}, err + } + + // Ignore the health check results here, they are used to set health related conditions on Machines. 
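Earlier in this hunk, reconcile() compares the machines that createControlPlane filtered by ownership against all control-plane machines and refuses to operate in mixed management mode when they differ. A toy sketch of that guard, with Machine and the ownership test reduced to stand-ins rather than the real machinefilters API.

package main

import "fmt"

type Machine struct {
	Name    string
	OwnedBy string
}

// ownedBy stands in for machinefilters.OwnedMachines(kcp).
func ownedBy(kcp string) func(Machine) bool {
	return func(m Machine) bool { return m.OwnedBy == kcp }
}

func filter(ms []Machine, keep func(Machine) bool) []Machine {
	var out []Machine
	for _, m := range ms {
		if keep(m) {
			out = append(out, m)
		}
	}
	return out
}

func main() {
	all := []Machine{{"cp-0", "kcp-a"}, {"cp-1", "kcp-a"}, {"cp-2", ""}}
	owned := filter(all, ownedBy("kcp-a"))
	if len(owned) != len(all) {
		// Mirrors the "refusing to operate in mixed management mode" early return.
		fmt.Println("not all control plane machines are owned by this KubeadmControlPlane")
	}
}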
+ _, err = r.managementCluster.TargetClusterControlPlaneHealthCheck(ctx, controlPlane, util.ObjectKey(cluster)) + if err != nil { + return ctrl.Result{}, err + } + _, err = r.managementCluster.TargetClusterEtcdHealthCheck(ctx, controlPlane, util.ObjectKey(cluster)) + if err != nil { + return ctrl.Result{}, err + } + allMachines, err := r.managementCluster.GetMachinesForCluster(ctx, util.ObjectKey(cluster)) if err != nil { return ctrl.Result{}, err diff --git a/controlplane/kubeadm/controllers/fakes_test.go b/controlplane/kubeadm/controllers/fakes_test.go index fce1de771386..60f182dd2284 100644 --- a/controlplane/kubeadm/controllers/fakes_test.go +++ b/controlplane/kubeadm/controllers/fakes_test.go @@ -70,6 +70,19 @@ func (f *fakeManagementCluster) TargetClusterEtcdIsHealthy(_ context.Context, _ } return nil } +func (f *fakeManagementCluster) TargetClusterEtcdHealthCheck(_ context.Context, _ *internal.ControlPlane, _ client.ObjectKey) (internal.HealthCheckResult, error) { + if !f.EtcdHealthy { + return nil, errors.New("etcd is not healthy") + } + return nil, nil +} + +func (f *fakeManagementCluster) TargetClusterControlPlaneHealthCheck(_ context.Context, _ *internal.ControlPlane, _ client.ObjectKey) (internal.HealthCheckResult, error) { + if !f.ControlPlaneHealthy { + return nil, errors.New("control plane is not healthy") + } + return nil, nil +} type fakeWorkloadCluster struct { *internal.Workload diff --git a/controlplane/kubeadm/internal/cluster.go b/controlplane/kubeadm/internal/cluster.go index 363e852411c5..53711331f82a 100644 --- a/controlplane/kubeadm/internal/cluster.go +++ b/controlplane/kubeadm/internal/cluster.go @@ -41,6 +41,8 @@ type ManagementCluster interface { ctrlclient.Reader GetMachinesForCluster(ctx context.Context, cluster client.ObjectKey, filters ...machinefilters.Func) (FilterableMachineCollection, error) + TargetClusterControlPlaneHealthCheck(ctx context.Context, controlPlane *ControlPlane, clusterKey client.ObjectKey) (HealthCheckResult, error) + TargetClusterEtcdHealthCheck(ctx context.Context, controlPlane *ControlPlane, clusterKey client.ObjectKey) (HealthCheckResult, error) TargetClusterEtcdIsHealthy(ctx context.Context, controlPlane *ControlPlane, clusterKey client.ObjectKey) error TargetClusterControlPlaneIsHealthy(ctx context.Context, controlPlane *ControlPlane, clusterKey client.ObjectKey) error GetWorkloadCluster(ctx context.Context, clusterKey client.ObjectKey) (WorkloadCluster, error) @@ -179,21 +181,14 @@ func (m *Management) getApiServerEtcdClientCert(ctx context.Context, clusterKey return tls.X509KeyPair(crtData, keyData) } -type healthCheck func(context.Context, *ControlPlane) (HealthCheckResult, error) - // HealthCheck will run a generic health check function and report any errors discovered. // In addition to the health check, it also ensures there is a 1;1 match between nodes and machines. // To have access to the owned control-plane machines during health checks, need to pass owningMachines here. -func (m *Management) healthCheck(ctx context.Context, controlPlane *ControlPlane, check healthCheck, clusterKey client.ObjectKey) error { +func (m *Management) healthCheck(controlPlane *ControlPlane, nodeChecks HealthCheckResult, clusterKey client.ObjectKey) error { var errorList []error kcpMachines := controlPlane.Machines.UnsortedList() // Make sure Cluster API is aware of all the nodes. - nodeChecks, err := check(ctx, controlPlane) - if err != nil { - errorList = append(errorList, err) - } - // TODO: If any node has a failure, healthCheck fails. 
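With the signature change above, healthCheck receives a precomputed HealthCheckResult instead of a callback. Judging from the range over nodeChecks and the test fakes, HealthCheckResult is a map of node name to that node's check error; a minimal sketch, with that assumption made explicit, of folding it into a single aggregate error.

package main

import (
	"errors"
	"fmt"
	"strings"
)

// HealthCheckResult is assumed to map node name to that node's check error,
// matching the "for nodeName, err := range nodeChecks" loop in healthCheck.
type HealthCheckResult map[string]error

func foldErrors(nodeChecks HealthCheckResult) error {
	var msgs []string
	for nodeName, err := range nodeChecks {
		if err != nil {
			msgs = append(msgs, fmt.Sprintf("node %q: %v", nodeName, err))
		}
	}
	if len(msgs) > 0 {
		return fmt.Errorf("control plane unhealthy: %s", strings.Join(msgs, "; "))
	}
	return nil
}

func main() {
	fmt.Println(foldErrors(HealthCheckResult{
		"node-1": nil,
		"node-2": errors.New("static pod kube-apiserver is missing"),
	}))
}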
This may be too strict for the health check of HA clusters (Missing a single etcd pod does not indicate a problem)... for nodeName, err := range nodeChecks { if err != nil { @@ -222,22 +217,39 @@ func (m *Management) healthCheck(ctx context.Context, controlPlane *ControlPlane return nil } +func (m *Management) TargetClusterControlPlaneHealthCheck(ctx context.Context, controlPlane *ControlPlane, clusterKey client.ObjectKey) (HealthCheckResult, error) { + workloadCluster, err := m.GetWorkloadCluster(ctx, clusterKey) + if err != nil { + return nil, err + } + return workloadCluster.ControlPlaneIsHealthy(ctx, controlPlane) +} + +func (m *Management) TargetClusterEtcdHealthCheck(ctx context.Context, controlPlane *ControlPlane, clusterKey client.ObjectKey) (HealthCheckResult, error) { + workloadCluster, err := m.GetWorkloadCluster(ctx, clusterKey) + if err != nil { + return nil, err + } + return workloadCluster.EtcdIsHealthy(ctx, controlPlane) +} + // TargetClusterControlPlaneIsHealthy checks every node for control plane health. func (m *Management) TargetClusterControlPlaneIsHealthy(ctx context.Context, controlPlane *ControlPlane, clusterKey client.ObjectKey) error { // TODO: add checks for expected taints/labels - cluster, err := m.GetWorkloadCluster(ctx, clusterKey) + + checkResult, err := m.TargetClusterControlPlaneHealthCheck(ctx, controlPlane, clusterKey) if err != nil { return err } - return m.healthCheck(ctx, controlPlane, cluster.ControlPlaneIsHealthy, clusterKey) + return m.healthCheck(controlPlane, checkResult, clusterKey) } // TargetClusterEtcdIsHealthy runs a series of checks over a target cluster's etcd cluster. // In addition, it verifies that there are the same number of etcd members as control plane Machines. func (m *Management) TargetClusterEtcdIsHealthy(ctx context.Context, controlPlane *ControlPlane, clusterKey client.ObjectKey) error { - cluster, err := m.GetWorkloadCluster(ctx, clusterKey) + checkResult, err := m.TargetClusterEtcdHealthCheck(ctx, controlPlane, clusterKey) if err != nil { return err } - return m.healthCheck(ctx, controlPlane, cluster.EtcdIsHealthy, clusterKey) + return m.healthCheck(controlPlane, checkResult, clusterKey) } diff --git a/controlplane/kubeadm/internal/cluster_test.go b/controlplane/kubeadm/internal/cluster_test.go index f4193244ac5f..c9cc1f436ce7 100644 --- a/controlplane/kubeadm/internal/cluster_test.go +++ b/controlplane/kubeadm/internal/cluster_test.go @@ -25,9 +25,6 @@ import ( "errors" "fmt" "math/big" - "sigs.k8s.io/cluster-api/controlplane/kubeadm/api/v1alpha3" - "sigs.k8s.io/cluster-api/util/kubeconfig" - "sigs.k8s.io/cluster-api/util/secret" "testing" "time" @@ -41,8 +38,11 @@ import ( "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/runtime/schema" clusterv1 "sigs.k8s.io/cluster-api/api/v1alpha3" + "sigs.k8s.io/cluster-api/controlplane/kubeadm/api/v1alpha3" "sigs.k8s.io/cluster-api/controlplane/kubeadm/internal/machinefilters" "sigs.k8s.io/cluster-api/util/certs" + "sigs.k8s.io/cluster-api/util/kubeconfig" + "sigs.k8s.io/cluster-api/util/secret" "sigs.k8s.io/controller-runtime/pkg/client" ) @@ -476,19 +476,17 @@ func TestManagementCluster_healthCheck_NoError(t *testing.T) { controlPlane := createControlPlane(threeMachines) tests := []struct { name string - check healthCheck + checkResult HealthCheckResult clusterKey client.ObjectKey controlPlaneName string controlPlane *ControlPlane }{ { name: "simple", - check: func(ctx context.Context, controlPlane *ControlPlane) (HealthCheckResult, error) { - return HealthCheckResult{ - 
"one": nil, - "two": nil, - "three": nil, - }, nil + checkResult: HealthCheckResult{ + "one": nil, + "two": nil, + "three": nil, }, clusterKey: client.ObjectKey{Namespace: "default", Name: "cluster-name"}, controlPlane: controlPlane, @@ -499,11 +497,10 @@ func TestManagementCluster_healthCheck_NoError(t *testing.T) { t.Run(tt.name, func(t *testing.T) { g := NewWithT(t) - ctx := context.Background() m := &Management{ Client: &fakeClient{}, } - g.Expect(m.healthCheck(ctx, controlPlane, tt.check, tt.clusterKey)).To(Succeed()) + g.Expect(m.healthCheck(controlPlane, tt.checkResult, tt.clusterKey)).To(Succeed()) }) } } @@ -512,7 +509,7 @@ func TestManagementCluster_healthCheck_Errors(t *testing.T) { tests := []struct { name string - check healthCheck + checkResult HealthCheckResult clusterKey client.ObjectKey controlPlaneName string controlPlane *ControlPlane @@ -527,27 +524,9 @@ func TestManagementCluster_healthCheck_Errors(t *testing.T) { controlPlaneMachine("two"), controlPlaneMachine("three"), }), - check: func(ctx context.Context, controlPlane *ControlPlane) (HealthCheckResult, error) { - return HealthCheckResult{ - "one": nil, - }, nil - }, - }, - { - name: "health check returns an error not related to the nodes health", - controlPlane: createControlPlane([]*clusterv1.Machine{ - controlPlaneMachine("one"), - controlPlaneMachine("two"), - controlPlaneMachine("three"), - }), - check: func(ctx context.Context, controlPlane *ControlPlane) (HealthCheckResult, error) { - return HealthCheckResult{ - "one": nil, - "two": errors.New("two"), - "three": errors.New("three"), - }, errors.New("meta") + checkResult: HealthCheckResult{ + "one": nil, }, - expectedErrors: []string{"two", "three", "meta"}, }, { name: "two nodes error on the check but no overall error occurred", @@ -555,12 +534,10 @@ func TestManagementCluster_healthCheck_Errors(t *testing.T) { controlPlaneMachine("one"), controlPlaneMachine("two"), controlPlaneMachine("three")}), - check: func(ctx context.Context, controlPlane *ControlPlane) (HealthCheckResult, error) { - return HealthCheckResult{ - "one": nil, - "two": errors.New("two"), - "three": errors.New("three"), - }, nil + checkResult: HealthCheckResult{ + "one": nil, + "two": errors.New("two"), + "three": errors.New("three"), }, expectedErrors: []string{"two", "three"}, }, @@ -568,12 +545,10 @@ func TestManagementCluster_healthCheck_Errors(t *testing.T) { name: "more nodes than machines were checked (out of band control plane nodes)", controlPlane: createControlPlane([]*clusterv1.Machine{ controlPlaneMachine("one")}), - check: func(ctx context.Context, controlPlane *ControlPlane) (HealthCheckResult, error) { - return HealthCheckResult{ - "one": nil, - "two": nil, - "three": nil, - }, nil + checkResult: HealthCheckResult{ + "one": nil, + "two": nil, + "three": nil, }, }, { @@ -582,12 +557,10 @@ func TestManagementCluster_healthCheck_Errors(t *testing.T) { controlPlaneMachine("one"), controlPlaneMachine("two"), nilNodeRef(controlPlaneMachine("three"))}), - check: func(ctx context.Context, controlPlane *ControlPlane) (HealthCheckResult, error) { - return HealthCheckResult{ - "one": nil, - "two": nil, - "three": nil, - }, nil + checkResult: HealthCheckResult{ + "one": nil, + "two": nil, + "three": nil, }, }, } @@ -595,11 +568,10 @@ func TestManagementCluster_healthCheck_Errors(t *testing.T) { t.Run(tt.name, func(t *testing.T) { g := NewWithT(t) - ctx := context.Background() clusterKey := client.ObjectKey{Namespace: "default", Name: "cluster-name"} m := &Management{Client: &fakeClient{}} 
- err := m.healthCheck(ctx, tt.controlPlane, tt.check, clusterKey) + err := m.healthCheck(tt.controlPlane, tt.checkResult, clusterKey) g.Expect(err).To(HaveOccurred()) for _, expectedError := range tt.expectedErrors { From 800b8dd9073a4f38ad8aa324661571d606a37376 Mon Sep 17 00:00:00 2001 From: Sedef Date: Sun, 11 Oct 2020 21:52:57 -0700 Subject: [PATCH 09/12] Disregard health check errors during reconcileDelete --- controlplane/kubeadm/controllers/controller.go | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/controlplane/kubeadm/controllers/controller.go b/controlplane/kubeadm/controllers/controller.go index 099baa3ac952..04a700531001 100644 --- a/controlplane/kubeadm/controllers/controller.go +++ b/controlplane/kubeadm/controllers/controller.go @@ -408,15 +408,10 @@ func (r *KubeadmControlPlaneReconciler) reconcileDelete(ctx context.Context, clu return ctrl.Result{}, err } - // Ignore the health check results here, they are used to set health related conditions on Machines. - _, err = r.managementCluster.TargetClusterControlPlaneHealthCheck(ctx, controlPlane, util.ObjectKey(cluster)) - if err != nil { - return ctrl.Result{}, err - } - _, err = r.managementCluster.TargetClusterEtcdHealthCheck(ctx, controlPlane, util.ObjectKey(cluster)) - if err != nil { - return ctrl.Result{}, err - } + // Ignore the health check results here as well as the errors, they are used to set health related conditions on Machines. + // Errors may be dues not being able to get workload cluster nodes. + r.managementCluster.TargetClusterControlPlaneHealthCheck(ctx, controlPlane, util.ObjectKey(cluster)) + r.managementCluster.TargetClusterEtcdHealthCheck(ctx, controlPlane, util.ObjectKey(cluster)) allMachines, err := r.managementCluster.GetMachinesForCluster(ctx, util.ObjectKey(cluster)) if err != nil { From b52b9b7bce27e7943d4d63f593e368b240d49fd8 Mon Sep 17 00:00:00 2001 From: Sedef Date: Mon, 12 Oct 2020 00:18:55 -0700 Subject: [PATCH 10/12] Add quorum check and ignore health errors during reconcileDelete --- .../kubeadm/controllers/controller.go | 4 +- controlplane/kubeadm/internal/etcd/etcd.go | 22 +++++++- .../kubeadm/internal/etcd/etcd_test.go | 4 +- .../kubeadm/internal/workload_cluster_etcd.go | 51 ++++++++++++++----- 4 files changed, 63 insertions(+), 18 deletions(-) diff --git a/controlplane/kubeadm/controllers/controller.go b/controlplane/kubeadm/controllers/controller.go index 04a700531001..d85bb608f5cb 100644 --- a/controlplane/kubeadm/controllers/controller.go +++ b/controlplane/kubeadm/controllers/controller.go @@ -410,8 +410,8 @@ func (r *KubeadmControlPlaneReconciler) reconcileDelete(ctx context.Context, clu // Ignore the health check results here as well as the errors, they are used to set health related conditions on Machines. // Errors may be dues not being able to get workload cluster nodes. 
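During deletion the workload cluster's API server may already be unreachable, so these checks are run purely for their side effect of setting conditions on Machines and their errors are tolerated. A small sketch of that best-effort pattern; the function name and error text are illustrative, not the real ManagementCluster API.

package main

import (
	"context"
	"errors"
	"log"
)

// healthCheck stands in for TargetClusterControlPlaneHealthCheck; during deletion
// the workload cluster may already be gone, so an error here is expected.
func healthCheck(ctx context.Context) (map[string]error, error) {
	return nil, errors.New("unable to reach workload cluster")
}

func main() {
	// Best effort: the result only feeds machine conditions, so record the error
	// and continue with deletion instead of requeueing on it.
	if _, err := healthCheck(context.Background()); err != nil {
		log.Printf("health check skipped during delete: %v", err)
	}
}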
- r.managementCluster.TargetClusterControlPlaneHealthCheck(ctx, controlPlane, util.ObjectKey(cluster)) - r.managementCluster.TargetClusterEtcdHealthCheck(ctx, controlPlane, util.ObjectKey(cluster)) + r.managementCluster.TargetClusterControlPlaneHealthCheck(ctx, controlPlane, util.ObjectKey(cluster)) //nolint + r.managementCluster.TargetClusterEtcdHealthCheck(ctx, controlPlane, util.ObjectKey(cluster)) //nolint allMachines, err := r.managementCluster.GetMachinesForCluster(ctx, util.ObjectKey(cluster)) if err != nil { diff --git a/controlplane/kubeadm/internal/etcd/etcd.go b/controlplane/kubeadm/internal/etcd/etcd.go index a71fbb898de7..0192500067ce 100644 --- a/controlplane/kubeadm/internal/etcd/etcd.go +++ b/controlplane/kubeadm/internal/etcd/etcd.go @@ -24,6 +24,7 @@ import ( "github.com/pkg/errors" "go.etcd.io/etcd/clientv3" + "go.etcd.io/etcd/etcdserver/api/v3rpc/rpctypes" "go.etcd.io/etcd/etcdserver/etcdserverpb" "google.golang.org/grpc" "sigs.k8s.io/cluster-api/controlplane/kubeadm/internal/proxy" @@ -53,6 +54,7 @@ type Client struct { EtcdClient etcd Endpoint string LeaderID uint64 + KVClient *clientv3.Client } // MemberAlarm represents an alarm type association with a cluster member. @@ -132,14 +134,15 @@ func NewClient(ctx context.Context, endpoints []string, p proxy.Proxy, tlsConfig }, TLS: tlsConfig, }) + if err != nil { return nil, errors.Wrap(err, "unable to create etcd client") } - return newEtcdClient(ctx, etcdClient) + return newEtcdClient(ctx, etcdClient, etcdClient) } -func newEtcdClient(ctx context.Context, etcdClient etcd) (*Client, error) { +func newEtcdClient(ctx context.Context, etcdClient etcd, kvClient *clientv3.Client) (*Client, error) { endpoints := etcdClient.Endpoints() if len(endpoints) == 0 { return nil, errors.New("etcd client was not configured with any endpoints") @@ -154,6 +157,7 @@ func newEtcdClient(ctx context.Context, etcdClient etcd) (*Client, error) { Endpoint: endpoints[0], EtcdClient: etcdClient, LeaderID: status.Leader, + KVClient: kvClient, }, nil } @@ -234,3 +238,17 @@ func (c *Client) Alarms(ctx context.Context) ([]MemberAlarm, error) { return memberAlarms, nil } + +// HealthCheck checks the healthiness of endpoints specified in endpoints during Client creation. +// Using the same logic used in etcdctl health command. +func (c *Client) HealthCheck(ctx context.Context) error { + // This check is return nil if FakeEtcdClient. 
+ if c.KVClient == nil { + return nil + } + _, err := c.KVClient.Get(ctx, "health") + if err == nil || err == rpctypes.ErrPermissionDenied { + return nil + } + return err +} diff --git a/controlplane/kubeadm/internal/etcd/etcd_test.go b/controlplane/kubeadm/internal/etcd/etcd_test.go index 1d7f99abcaec..d6f6d3cfc145 100644 --- a/controlplane/kubeadm/internal/etcd/etcd_test.go +++ b/controlplane/kubeadm/internal/etcd/etcd_test.go @@ -47,7 +47,7 @@ func TestEtcdMembers_WithErrors(t *testing.T) { ErrorResponse: errors.New("something went wrong"), } - client, err := newEtcdClient(ctx, fakeEtcdClient) + client, err := newEtcdClient(ctx, fakeEtcdClient, nil) g.Expect(err).NotTo(HaveOccurred()) members, err := client.Members(ctx) @@ -86,7 +86,7 @@ func TestEtcdMembers_WithSuccess(t *testing.T) { StatusResponse: &clientv3.StatusResponse{}, } - client, err := newEtcdClient(ctx, fakeEtcdClient) + client, err := newEtcdClient(ctx, fakeEtcdClient, nil) g.Expect(err).NotTo(HaveOccurred()) members, err := client.Members(ctx) diff --git a/controlplane/kubeadm/internal/workload_cluster_etcd.go b/controlplane/kubeadm/internal/workload_cluster_etcd.go index 11e153cbef3d..2af5451adf78 100644 --- a/controlplane/kubeadm/internal/workload_cluster_etcd.go +++ b/controlplane/kubeadm/internal/workload_cluster_etcd.go @@ -99,6 +99,11 @@ func (w *Workload) EtcdIsHealthy(ctx context.Context, controlPlane *ControlPlane } defer etcdClient.Close() + err = etcdClient.HealthCheck(ctx) + if err != nil { + response[name] = errors.Wrap(err, "etcd member is unhealthy") + continue + } // List etcd members. This checks that the member is healthy, because the request goes through consensus. members, err := etcdClient.Members(ctx) if err != nil { @@ -143,17 +148,6 @@ func (w *Workload) EtcdIsHealthy(ctx context.Context, controlPlane *ControlPlane } } - // Check etcd cluster alarms - etcdClient, err := w.etcdClientGenerator.forNodes(ctx, controlPlaneNodes.Items) - if err == nil { - defer etcdClient.Close() - alarmList, err := etcdClient.Alarms(ctx) - if len(alarmList) > 0 || err != nil { - conditions.MarkFalse(controlPlane.KCP, controlplanev1.EtcdClusterHealthy, controlplanev1.EtcdClusterUnhealthyReason, clusterv1.ConditionSeverityWarning, "etcd cluster has alarms.") - return response, errors.Errorf("etcd cluster has %d alarms", len(alarmList)) - } - } - // TODO: ensure that each pod is owned by a node that we're managing. That would ensure there are no out-of-band etcd members // Check that there is exactly one etcd member for every healthy pod. 
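The HealthCheck method above mirrors the etcdctl endpoint health logic: it issues a Get for an arbitrary key and treats a permission-denied response as healthy, because the request still reached the server. A self-contained sketch of that decision, with KV as a trimmed stand-in for the clientv3 KV interface and a local sentinel standing in for rpctypes.ErrPermissionDenied.

package main

import (
	"context"
	"errors"
	"fmt"
)

// errPermissionDenied stands in for rpctypes.ErrPermissionDenied.
var errPermissionDenied = errors.New("etcdserver: permission denied")

// KV is a trimmed stand-in for the clientv3 KV interface; only Get is needed here.
type KV interface {
	Get(ctx context.Context, key string) (string, error)
}

// healthCheck counts any Get that reaches the server as healthy,
// including one rejected for lack of permissions.
func healthCheck(ctx context.Context, kv KV) error {
	_, err := kv.Get(ctx, "health")
	if err == nil || errors.Is(err, errPermissionDenied) {
		return nil
	}
	return err
}

type fakeKV struct{ err error }

func (f fakeKV) Get(ctx context.Context, key string) (string, error) { return "", f.err }

func main() {
	fmt.Println(healthCheck(context.Background(), fakeKV{}))                         // <nil>
	fmt.Println(healthCheck(context.Background(), fakeKV{err: errPermissionDenied})) // <nil>
}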
@@ -163,11 +157,44 @@ func (w *Workload) EtcdIsHealthy(ctx context.Context, controlPlane *ControlPlane
 		return response, errors.Errorf("there are %d healthy etcd pods, but %d etcd members", expectedMembers, len(knownMemberIDSet))
 	}
+	// Check etcd cluster alarms
+	etcdClient, err := w.etcdClientGenerator.forNodes(ctx, controlPlaneNodes.Items)
+	if err != nil {
+		conditions.MarkFalse(controlPlane.KCP, controlplanev1.EtcdClusterHealthy, controlplanev1.EtcdClusterUnhealthyReason, clusterv1.ConditionSeverityWarning, "failed to get etcd client.")
+		return response, err
+	}
+
+	defer etcdClient.Close()
+	alarmList, err := etcdClient.Alarms(ctx)
+	if len(alarmList) > 0 || err != nil {
+		conditions.MarkFalse(controlPlane.KCP, controlplanev1.EtcdClusterHealthy, controlplanev1.EtcdClusterUnhealthyReason, clusterv1.ConditionSeverityWarning, "etcd cluster has alarms.")
+		return response, errors.Errorf("etcd cluster has %d alarms", len(alarmList))
+	}
+
+	members, err := etcdClient.Members(ctx)
+	if err != nil {
+		conditions.MarkFalse(controlPlane.KCP, controlplanev1.EtcdClusterHealthy, controlplanev1.EtcdClusterUnhealthyReason, clusterv1.ConditionSeverityWarning, "failed to get etcd members.")
+		return response, err
+	}
+
+	healthyMembers := 0
+	for _, m := range members {
+		if val, ok := response[m.Name]; ok {
+			if val == nil {
+				healthyMembers++
+			}
+		} else {
+			// There are members in the etcd cluster that are not part of the control plane nodes.
+			conditions.MarkFalse(controlPlane.KCP, controlplanev1.EtcdClusterHealthy, controlplanev1.EtcdClusterUnhealthyReason, clusterv1.ConditionSeverityWarning, "unknown etcd member that is not part of control plane nodes.")
+			return response, err
+		}
+	}
 	// TODO: During provisioning, this condition may be set false for a short time until all pods are provisioned, can add additional checks here to prevent this.
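The loop above counts a member as healthy only when its name matches a node that was probed earlier (a key in response); any member without a matching key is treated as out of band. A stand-alone sketch of that cross-check, assuming, as the patch does, that etcd member names equal node names.

package main

import "fmt"

// findUnknownMembers returns every member name that was never probed,
// i.e. has no entry in the per-node response map.
func findUnknownMembers(memberNames []string, response map[string]error) []string {
	var unknown []string
	for _, name := range memberNames {
		if _, probed := response[name]; !probed {
			unknown = append(unknown, name)
		}
	}
	return unknown
}

func main() {
	response := map[string]error{"cp-node-0": nil, "cp-node-1": nil}
	fmt.Println(findUnknownMembers([]string{"cp-node-0", "cp-node-1", "rogue-member"}, response))
	// [rogue-member]
}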
- if len(knownMemberIDSet) < (controlPlane.Machines.Len()/2.0 + 1) { + if healthyMembers < (len(members)/2 + 1) { conditions.MarkFalse(controlPlane.KCP, controlplanev1.EtcdClusterHealthy, controlplanev1.EtcdClusterUnhealthyReason, clusterv1.ConditionSeverityWarning, "etcd cluster's quorum is lost.") return response, errors.Errorf("etcd lost its quorum: there are %d control-plane machines, but %d etcd members", controlPlane.Machines.Len(), len(knownMemberIDSet)) } + return response, nil } From 846573d0fc0b938e69d88516c25ccbb9477af534 Mon Sep 17 00:00:00 2001 From: Sedef Date: Thu, 15 Oct 2020 11:00:56 -0700 Subject: [PATCH 11/12] Minor fixes --- controlplane/kubeadm/controllers/controller.go | 5 +++-- controlplane/kubeadm/internal/workload_cluster_etcd.go | 3 +-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/controlplane/kubeadm/controllers/controller.go b/controlplane/kubeadm/controllers/controller.go index d85bb608f5cb..19ad20f15acc 100644 --- a/controlplane/kubeadm/controllers/controller.go +++ b/controlplane/kubeadm/controllers/controller.go @@ -291,7 +291,7 @@ func (r *KubeadmControlPlaneReconciler) reconcile(ctx context.Context, cluster * } controlPlane, err := r.createControlPlane(ctx, cluster, kcp) - if controlPlane == nil || err != nil { + if err != nil { logger.Error(err, "failed to initialize control plane") return ctrl.Result{}, err } @@ -403,7 +403,7 @@ func (r *KubeadmControlPlaneReconciler) reconcileDelete(ctx context.Context, clu logger.Info("Reconcile KubeadmControlPlane deletion") controlPlane, err := r.createControlPlane(ctx, cluster, kcp) - if controlPlane == nil || err != nil { + if err != nil { logger.Error(err, "failed to initialize control plane") return ctrl.Result{}, err } @@ -413,6 +413,7 @@ func (r *KubeadmControlPlaneReconciler) reconcileDelete(ctx context.Context, clu r.managementCluster.TargetClusterControlPlaneHealthCheck(ctx, controlPlane, util.ObjectKey(cluster)) //nolint r.managementCluster.TargetClusterEtcdHealthCheck(ctx, controlPlane, util.ObjectKey(cluster)) //nolint + // Gets all machines, not just control plane machines. 
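Returning to the quorum check introduced above, which now counts healthy members against the member list etcd itself reports rather than against control-plane machines: with integer division the threshold works out as in this stand-alone sketch.

package main

import "fmt"

// quorum is the minimum number of healthy members: n/2 + 1 with integer division.
func quorum(members int) int { return members/2 + 1 }

func hasQuorum(healthy, members int) bool { return healthy >= quorum(members) }

func main() {
	fmt.Println(quorum(3), hasQuorum(2, 3)) // 2 true  (3-member cluster tolerates one unhealthy member)
	fmt.Println(quorum(5), hasQuorum(2, 5)) // 3 false (5-member cluster needs at least three healthy members)
}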
allMachines, err := r.managementCluster.GetMachinesForCluster(ctx, util.ObjectKey(cluster)) if err != nil { return ctrl.Result{}, err diff --git a/controlplane/kubeadm/internal/workload_cluster_etcd.go b/controlplane/kubeadm/internal/workload_cluster_etcd.go index 2af5451adf78..688b8a2abbc6 100644 --- a/controlplane/kubeadm/internal/workload_cluster_etcd.go +++ b/controlplane/kubeadm/internal/workload_cluster_etcd.go @@ -99,8 +99,7 @@ func (w *Workload) EtcdIsHealthy(ctx context.Context, controlPlane *ControlPlane } defer etcdClient.Close() - err = etcdClient.HealthCheck(ctx) - if err != nil { + if err := etcdClient.HealthCheck(ctx); err != nil { response[name] = errors.Wrap(err, "etcd member is unhealthy") continue } From c5126765206e0ba594b88553e7b11a999019e5f3 Mon Sep 17 00:00:00 2001 From: Sedef Date: Thu, 15 Oct 2020 11:41:14 -0700 Subject: [PATCH 12/12] Add fake KV etcd client --- .../kubeadm/controllers/controller.go | 16 +++++++++--- controlplane/kubeadm/internal/etcd/etcd.go | 8 ++---- .../kubeadm/internal/etcd/etcd_test.go | 6 ++++- .../kubeadm/internal/etcd/fake/client.go | 25 +++++++++++++++++++ 4 files changed, 44 insertions(+), 11 deletions(-) diff --git a/controlplane/kubeadm/controllers/controller.go b/controlplane/kubeadm/controllers/controller.go index 19ad20f15acc..7140ffcb384b 100644 --- a/controlplane/kubeadm/controllers/controller.go +++ b/controlplane/kubeadm/controllers/controller.go @@ -408,10 +408,18 @@ func (r *KubeadmControlPlaneReconciler) reconcileDelete(ctx context.Context, clu return ctrl.Result{}, err } - // Ignore the health check results here as well as the errors, they are used to set health related conditions on Machines. - // Errors may be dues not being able to get workload cluster nodes. - r.managementCluster.TargetClusterControlPlaneHealthCheck(ctx, controlPlane, util.ObjectKey(cluster)) //nolint - r.managementCluster.TargetClusterEtcdHealthCheck(ctx, controlPlane, util.ObjectKey(cluster)) //nolint + // Ignore the health check results here as well as the errors, health check functions are to set health related conditions on Machines. + // Errors may be due to not being able to get workload cluster nodes. + _, err = r.managementCluster.TargetClusterControlPlaneHealthCheck(ctx, controlPlane, util.ObjectKey(cluster)) + if err != nil { + // Do nothing + r.Log.Info("Control plane did not pass control plane health check during delete reconciliation", "err", err.Error()) + } + _, err = r.managementCluster.TargetClusterEtcdHealthCheck(ctx, controlPlane, util.ObjectKey(cluster)) + if err != nil { + // Do nothing + r.Log.Info("Control plane did not pass etcd health check during delete reconciliation", "err", err.Error()) + } // Gets all machines, not just control plane machines. allMachines, err := r.managementCluster.GetMachinesForCluster(ctx, util.ObjectKey(cluster)) diff --git a/controlplane/kubeadm/internal/etcd/etcd.go b/controlplane/kubeadm/internal/etcd/etcd.go index 0192500067ce..8a75d66d1fd0 100644 --- a/controlplane/kubeadm/internal/etcd/etcd.go +++ b/controlplane/kubeadm/internal/etcd/etcd.go @@ -54,7 +54,7 @@ type Client struct { EtcdClient etcd Endpoint string LeaderID uint64 - KVClient *clientv3.Client + KVClient clientv3.KV } // MemberAlarm represents an alarm type association with a cluster member. 
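Typing KVClient as the clientv3.KV interface rather than the concrete *clientv3.Client is what makes the fake added below possible. The dependency-inversion step in miniature, with KV trimmed down to the one method used here rather than the full clientv3 interface.

package main

import (
	"context"
	"fmt"
)

// KV is a trimmed stand-in for clientv3.KV.
type KV interface {
	Get(ctx context.Context, key string) (string, error)
}

// Client holds the narrow interface, so tests can inject a fake.
type Client struct {
	KV KV
}

type fakeKV struct{}

func (fakeKV) Get(ctx context.Context, key string) (string, error) { return "", nil }

func main() {
	c := Client{KV: fakeKV{}}
	_, err := c.KV.Get(context.Background(), "health")
	fmt.Println(err) // <nil>
}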
@@ -142,7 +142,7 @@ func NewClient(ctx context.Context, endpoints []string, p proxy.Proxy, tlsConfig return newEtcdClient(ctx, etcdClient, etcdClient) } -func newEtcdClient(ctx context.Context, etcdClient etcd, kvClient *clientv3.Client) (*Client, error) { +func newEtcdClient(ctx context.Context, etcdClient etcd, kvClient clientv3.KV) (*Client, error) { endpoints := etcdClient.Endpoints() if len(endpoints) == 0 { return nil, errors.New("etcd client was not configured with any endpoints") @@ -242,10 +242,6 @@ func (c *Client) Alarms(ctx context.Context) ([]MemberAlarm, error) { // HealthCheck checks the healthiness of endpoints specified in endpoints during Client creation. // Using the same logic used in etcdctl health command. func (c *Client) HealthCheck(ctx context.Context) error { - // This check is return nil if FakeEtcdClient. - if c.KVClient == nil { - return nil - } _, err := c.KVClient.Get(ctx, "health") if err == nil || err == rpctypes.ErrPermissionDenied { return nil diff --git a/controlplane/kubeadm/internal/etcd/etcd_test.go b/controlplane/kubeadm/internal/etcd/etcd_test.go index d6f6d3cfc145..487846df6dec 100644 --- a/controlplane/kubeadm/internal/etcd/etcd_test.go +++ b/controlplane/kubeadm/internal/etcd/etcd_test.go @@ -86,7 +86,11 @@ func TestEtcdMembers_WithSuccess(t *testing.T) { StatusResponse: &clientv3.StatusResponse{}, } - client, err := newEtcdClient(ctx, fakeEtcdClient, nil) + fakeKVClient := &etcdfake.FakeKVClient{ + GetResponse: &clientv3.GetResponse{}, + } + + client, err := newEtcdClient(ctx, fakeEtcdClient, fakeKVClient) g.Expect(err).NotTo(HaveOccurred()) members, err := client.Members(ctx) diff --git a/controlplane/kubeadm/internal/etcd/fake/client.go b/controlplane/kubeadm/internal/etcd/fake/client.go index 98ce83c9a2bb..1ec75d8446ae 100644 --- a/controlplane/kubeadm/internal/etcd/fake/client.go +++ b/controlplane/kubeadm/internal/etcd/fake/client.go @@ -22,6 +22,31 @@ import ( "go.etcd.io/etcd/clientv3" ) +type FakeKVClient struct { + GetResponse *clientv3.GetResponse +} + +func (kv *FakeKVClient) Put(ctx context.Context, key, val string, opts ...clientv3.OpOption) (*clientv3.PutResponse, error) { + return nil, nil +} +func (kv *FakeKVClient) Get(ctx context.Context, key string, opts ...clientv3.OpOption) (*clientv3.GetResponse, error) { + return kv.GetResponse, nil +} +func (kv *FakeKVClient) Delete(ctx context.Context, key string, opts ...clientv3.OpOption) (*clientv3.DeleteResponse, error) { + return nil, nil +} +func (kv *FakeKVClient) Compact(ctx context.Context, rev int64, opts ...clientv3.CompactOption) (*clientv3.CompactResponse, error) { + return nil, nil +} + +func (kv *FakeKVClient) Do(ctx context.Context, op clientv3.Op) (clientv3.OpResponse, error) { + return clientv3.OpResponse{}, nil +} + +func (kv *FakeKVClient) Txn(ctx context.Context) clientv3.Txn { + return nil +} + type FakeEtcdClient struct { AlarmResponse *clientv3.AlarmResponse EtcdEndpoints []string
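A hedged usage sketch, not part of the patch series: a test along these lines could exercise HealthCheck end to end through the fakes. It assumes it sits in the same test file as TestEtcdMembers_WithSuccess (same package, imports, etcdfake alias, and Gomega helpers) and that FakeEtcdClient's Endpoints/Status behavior matches the existing tests.

func TestEtcdClient_HealthCheckWithFakeKV(t *testing.T) {
	g := NewWithT(t)
	ctx := context.Background()

	fakeEtcdClient := &etcdfake.FakeEtcdClient{
		EtcdEndpoints:  []string{"https://etcd-0:2379"},
		StatusResponse: &clientv3.StatusResponse{},
	}
	// GetResponse is what HealthCheck's Get("health") call receives back.
	fakeKVClient := &etcdfake.FakeKVClient{GetResponse: &clientv3.GetResponse{}}

	client, err := newEtcdClient(ctx, fakeEtcdClient, fakeKVClient)
	g.Expect(err).NotTo(HaveOccurred())
	g.Expect(client.HealthCheck(ctx)).To(Succeed())
}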