From c97f63a06fdb8bc5e3c794dcf65dab982de7b5da Mon Sep 17 00:00:00 2001
From: Ben Moss
Date: Wed, 24 Jun 2020 16:20:05 +0000
Subject: [PATCH] Add MHC remediation to KCP

---
 .../kubeadm/controllers/controller.go         |  14 ++-
 controlplane/kubeadm/controllers/scale.go     |  30 ++----
 .../kubeadm/controllers/scale_test.go         |  68 +-----------
 .../kubeadm/controllers/upgrade_test.go       |   9 --
 .../kubeadm/internal/control_plane.go         |  27 ++++-
 .../kubeadm/internal/control_plane_test.go    | 100 ++++++++++++++++++
 .../kubeadm/internal/machine_collection.go    |  10 ++
 .../machinefilters/machine_filters.go         |  17 +++
 8 files changed, 174 insertions(+), 101 deletions(-)

diff --git a/controlplane/kubeadm/controllers/controller.go b/controlplane/kubeadm/controllers/controller.go
index 0d4d7751b011..58eace2548ac 100644
--- a/controlplane/kubeadm/controllers/controller.go
+++ b/controlplane/kubeadm/controllers/controller.go
@@ -279,6 +279,10 @@ func (r *KubeadmControlPlaneReconciler) reconcile(ctx context.Context, cluster *
 
 	controlPlane := internal.NewControlPlane(cluster, kcp, ownedMachines)
 
+	if controlPlane.HasDeletingMachine() || controlPlane.ProvisioningMachines().Len() > 0 {
+		return ctrl.Result{}, nil
+	}
+
 	// Aggregate the operational state of all the machines; while aggregating we are adding the
 	// source ref (reason@machine/name) so the problem can be easily tracked down to its source machine.
 	conditions.SetAggregate(controlPlane.KCP, controlplanev1.MachinesReadyCondition, ownedMachines.ConditionGetters(), conditions.AddSourceRef())
@@ -305,6 +309,12 @@ func (r *KubeadmControlPlaneReconciler) reconcile(ctx context.Context, cluster *
 	numMachines := len(ownedMachines)
 	desiredReplicas := int(*kcp.Spec.Replicas)
 
+	if numMachines > 0 && controlPlane.UnhealthyMachines().Len() == 0 {
+		if err := r.reconcileHealth(ctx, cluster, kcp, controlPlane); err != nil {
+			return ctrl.Result{}, err
+		}
+	}
+
 	switch {
 	// We are creating the first replica
 	case numMachines < desiredReplicas && numMachines == 0:
@@ -317,8 +327,8 @@ func (r *KubeadmControlPlaneReconciler) reconcile(ctx context.Context, cluster *
 		// Create a new Machine w/ join
 		logger.Info("Scaling up control plane", "Desired", desiredReplicas, "Existing", numMachines)
 		return r.scaleUpControlPlane(ctx, cluster, kcp, controlPlane)
-	// We are scaling down
-	case numMachines > desiredReplicas:
+	// We are scaling down
+	case numMachines > desiredReplicas || controlPlane.UnhealthyMachines().Len() > 0:
 		logger.Info("Scaling down control plane", "Desired", desiredReplicas, "Existing", numMachines)
 		return r.scaleDownControlPlane(ctx, cluster, kcp, controlPlane)
 	}
diff --git a/controlplane/kubeadm/controllers/scale.go b/controlplane/kubeadm/controllers/scale.go
index cedc48bc6b6c..d7b09b16feaf 100644
--- a/controlplane/kubeadm/controllers/scale.go
+++ b/controlplane/kubeadm/controllers/scale.go
@@ -26,7 +26,6 @@ import (
 	controlplanev1 "sigs.k8s.io/cluster-api/controlplane/kubeadm/api/v1alpha3"
 	"sigs.k8s.io/cluster-api/controlplane/kubeadm/internal"
 	"sigs.k8s.io/cluster-api/controlplane/kubeadm/internal/machinefilters"
-	capierrors "sigs.k8s.io/cluster-api/errors"
 	"sigs.k8s.io/cluster-api/util"
 	ctrl "sigs.k8s.io/controller-runtime"
 )
@@ -63,11 +62,6 @@ func (r *KubeadmControlPlaneReconciler) initializeControlPlane(ctx context.Conte
 func (r *KubeadmControlPlaneReconciler) scaleUpControlPlane(ctx context.Context, cluster *clusterv1.Cluster, kcp *controlplanev1.KubeadmControlPlane, controlPlane *internal.ControlPlane) (ctrl.Result, error) {
 	logger := controlPlane.Logger()
 
-	// reconcileHealth returns err if there is a machine being delete which is a required condition to check before scaling up
-	if err := r.reconcileHealth(ctx, cluster, kcp, controlPlane); err != nil {
-		return ctrl.Result{}, &capierrors.RequeueAfterError{RequeueAfter: healthCheckFailedRequeueAfter}
-	}
-
 	// Create the bootstrap configuration
 	bootstrapSpec := controlPlane.JoinControlPlaneConfig()
 	fd := controlPlane.FailureDomainWithFewestMachines()
@@ -89,10 +83,6 @@ func (r *KubeadmControlPlaneReconciler) scaleDownControlPlane(
 ) (ctrl.Result, error) {
 	logger := controlPlane.Logger()
 
-	if err := r.reconcileHealth(ctx, cluster, kcp, controlPlane); err != nil {
-		return ctrl.Result{}, &capierrors.RequeueAfterError{RequeueAfter: healthCheckFailedRequeueAfter}
-	}
-
 	workloadCluster, err := r.managementCluster.GetWorkloadCluster(ctx, util.ObjectKey(cluster))
 	if err != nil {
 		logger.Error(err, "Failed to create client to workload cluster")
@@ -109,6 +99,11 @@ func (r *KubeadmControlPlaneReconciler) scaleDownControlPlane(
 		return ctrl.Result{}, errors.New("failed to pick control plane Machine to delete")
 	}
 
+	if err := workloadCluster.RemoveMachineFromKubeadmConfigMap(ctx, machineToDelete); err != nil {
+		logger.Error(err, "Failed to remove machine from kubeadm ConfigMap")
+		return ctrl.Result{}, err
+	}
+
 	// If etcd leadership is on machine that is about to be deleted, move it to the newest member available.
 	etcdLeaderCandidate := controlPlane.Machines.Newest()
 	if err := workloadCluster.ForwardEtcdLeadership(ctx, machineToDelete, etcdLeaderCandidate); err != nil {
@@ -120,18 +115,6 @@ func (r *KubeadmControlPlaneReconciler) scaleDownControlPlane(
 		return ctrl.Result{}, err
 	}
 
-	if err := r.managementCluster.TargetClusterControlPlaneIsHealthy(ctx, util.ObjectKey(cluster)); err != nil {
-		logger.V(2).Info("Waiting for control plane to pass control plane health check before removing a control plane machine", "cause", err)
-		r.recorder.Eventf(kcp, corev1.EventTypeWarning, "ControlPlaneUnhealthy",
-			"Waiting for control plane to pass control plane health check before removing a control plane machine: %v", err)
-		return ctrl.Result{}, &capierrors.RequeueAfterError{RequeueAfter: healthCheckFailedRequeueAfter}
-
-	}
-	if err := workloadCluster.RemoveMachineFromKubeadmConfigMap(ctx, machineToDelete); err != nil {
-		logger.Error(err, "Failed to remove machine from kubeadm ConfigMap")
-		return ctrl.Result{}, err
-	}
-
 	logger = logger.WithValues("machine", machineToDelete)
 	if err := r.Client.Delete(ctx, machineToDelete); err != nil && !apierrors.IsNotFound(err) {
 		logger.Error(err, "Failed to delete control plane machine")
@@ -149,5 +132,8 @@ func selectMachineForScaleDown(controlPlane *internal.ControlPlane) (*clusterv1.
 	if needingUpgrade := controlPlane.MachinesNeedingRollout(); needingUpgrade.Len() > 0 {
 		machines = needingUpgrade
 	}
+	if unhealthy := controlPlane.UnhealthyMachines(); unhealthy.Len() > 0 {
+		machines = unhealthy
+	}
 	return controlPlane.MachineInFailureDomainWithMostMachines(machines)
 }
diff --git a/controlplane/kubeadm/controllers/scale_test.go b/controlplane/kubeadm/controllers/scale_test.go
index 42905d35663b..23e1a14bfb7b 100644
--- a/controlplane/kubeadm/controllers/scale_test.go
+++ b/controlplane/kubeadm/controllers/scale_test.go
@@ -33,7 +33,6 @@ import (
 	controlplanev1 "sigs.k8s.io/cluster-api/controlplane/kubeadm/api/v1alpha3"
 	"sigs.k8s.io/cluster-api/controlplane/kubeadm/internal"
 	"sigs.k8s.io/cluster-api/controlplane/kubeadm/internal/hash"
-	capierrors "sigs.k8s.io/cluster-api/errors"
 	ctrl "sigs.k8s.io/controller-runtime"
 	"sigs.k8s.io/controller-runtime/pkg/client"
 	"sigs.k8s.io/controller-runtime/pkg/log"
@@ -84,7 +83,7 @@ func TestKubeadmControlPlaneReconciler_initializeControlPlane(t *testing.T) {
 }
 
 func TestKubeadmControlPlaneReconciler_scaleUpControlPlane(t *testing.T) {
-	t.Run("creates a control plane Machine if health checks pass", func(t *testing.T) {
+	t.Run("creates a control plane Machine", func(t *testing.T) {
 		g := NewWithT(t)
 
 		cluster, kcp, genericMachineTemplate := createClusterWithControlPlane()
@@ -125,71 +124,6 @@ func TestKubeadmControlPlaneReconciler_scaleUpControlPlane(t *testing.T) {
 		g.Expect(fakeClient.List(context.Background(), &controlPlaneMachines)).To(Succeed())
 		g.Expect(controlPlaneMachines.Items).To(HaveLen(3))
 	})
-	t.Run("does not create a control plane Machine if health checks fail", func(t *testing.T) {
-		cluster, kcp, genericMachineTemplate := createClusterWithControlPlane()
-		initObjs := []runtime.Object{cluster.DeepCopy(), kcp.DeepCopy(), genericMachineTemplate.DeepCopy()}
-
-		beforeMachines := internal.NewFilterableMachineCollection()
-		for i := 0; i < 2; i++ {
-			m, _ := createMachineNodePair(fmt.Sprintf("test-%d", i), cluster.DeepCopy(), kcp.DeepCopy(), true)
-			beforeMachines = beforeMachines.Insert(m)
-			initObjs = append(initObjs, m.DeepCopy())
-		}
-
-		testCases := []struct {
-			name                  string
-			etcdUnHealthy         bool
-			controlPlaneUnHealthy bool
-		}{
-			{
-				name:          "etcd health check fails",
-				etcdUnHealthy: true,
-			},
-			{
-				name:                  "controlplane component health check fails",
-				controlPlaneUnHealthy: true,
-			},
-		}
-		for _, tc := range testCases {
-			g := NewWithT(t)
-
-			fakeClient := newFakeClient(g, initObjs...)
-			fmc := &fakeManagementCluster{
-				Machines:            beforeMachines.DeepCopy(),
-				ControlPlaneHealthy: !tc.controlPlaneUnHealthy,
-				EtcdHealthy:         !tc.etcdUnHealthy,
-			}
-
-			r := &KubeadmControlPlaneReconciler{
-				Client:                    fakeClient,
-				managementCluster:         fmc,
-				managementClusterUncached: fmc,
-				Log:                       log.Log,
-				recorder:                  record.NewFakeRecorder(32),
-			}
-			controlPlane := &internal.ControlPlane{
-				KCP:      kcp,
-				Cluster:  cluster,
-				Machines: beforeMachines,
-			}
-
-			_, err := r.scaleUpControlPlane(context.Background(), cluster.DeepCopy(), kcp.DeepCopy(), controlPlane)
-			g.Expect(err).To(HaveOccurred())
-			g.Expect(err).To(MatchError(&capierrors.RequeueAfterError{RequeueAfter: healthCheckFailedRequeueAfter}))
-
-			controlPlaneMachines := &clusterv1.MachineList{}
-			g.Expect(fakeClient.List(context.Background(), controlPlaneMachines)).To(Succeed())
-			g.Expect(controlPlaneMachines.Items).To(HaveLen(len(beforeMachines)))
-
-			endMachines := internal.NewFilterableMachineCollectionFromMachineList(controlPlaneMachines)
-			for _, m := range endMachines {
-				bm, ok := beforeMachines[m.Name]
-				bm.SetResourceVersion("1")
-				g.Expect(ok).To(BeTrue())
-				g.Expect(m).To(Equal(bm))
-			}
-		}
-	})
 }
 
 func TestKubeadmControlPlaneReconciler_scaleDownControlPlane_NoError(t *testing.T) {
diff --git a/controlplane/kubeadm/controllers/upgrade_test.go b/controlplane/kubeadm/controllers/upgrade_test.go
index 7dbd859c046b..a2238cd4e138 100644
--- a/controlplane/kubeadm/controllers/upgrade_test.go
+++ b/controlplane/kubeadm/controllers/upgrade_test.go
@@ -27,7 +27,6 @@ import (
 	"k8s.io/utils/pointer"
 	clusterv1 "sigs.k8s.io/cluster-api/api/v1alpha3"
 	"sigs.k8s.io/cluster-api/controlplane/kubeadm/internal"
-	capierrors "sigs.k8s.io/cluster-api/errors"
 	ctrl "sigs.k8s.io/controller-runtime"
 	"sigs.k8s.io/controller-runtime/pkg/client"
 	"sigs.k8s.io/controller-runtime/pkg/log"
@@ -87,14 +86,6 @@ func TestKubeadmControlPlaneReconciler_upgradeControlPlane(t *testing.T) {
 	bothMachines := &clusterv1.MachineList{}
 	g.Expect(fakeClient.List(context.Background(), bothMachines, client.InNamespace(cluster.Namespace))).To(Succeed())
 	g.Expect(bothMachines.Items).To(HaveLen(2))
-
-	// run upgrade a second time, simulate that the node has not appeared yet but the machine exists
-	r.managementCluster.(*fakeManagementCluster).ControlPlaneHealthy = false
-	_, err = r.upgradeControlPlane(context.Background(), cluster, kcp, controlPlane)
-	g.Expect(err).To(Equal(&capierrors.RequeueAfterError{RequeueAfter: healthCheckFailedRequeueAfter}))
-	g.Expect(fakeClient.List(context.Background(), bothMachines, client.InNamespace(cluster.Namespace))).To(Succeed())
-	g.Expect(bothMachines.Items).To(HaveLen(2))
-
 	controlPlane.Machines = internal.NewFilterableMachineCollectionFromMachineList(bothMachines)
 
 	// manually increase number of nodes, make control plane healthy again
diff --git a/controlplane/kubeadm/internal/control_plane.go b/controlplane/kubeadm/internal/control_plane.go
index 405d62f278be..69db85836b00 100644
--- a/controlplane/kubeadm/internal/control_plane.go
+++ b/controlplane/kubeadm/internal/control_plane.go
@@ -17,8 +17,9 @@ limitations under the License.
 package internal
 
 import (
+	"errors"
+
 	"github.com/go-logr/logr"
-	"github.com/pkg/errors"
 	corev1 "k8s.io/api/core/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apiserver/pkg/storage/names"
@@ -29,6 +30,9 @@ import (
 	"sigs.k8s.io/cluster-api/controlplane/kubeadm/internal/machinefilters"
 )
 
+// MachineHealthCheck remediation is only supported on clusters with at least 3 machines, to avoid disrupting etcd consensus.
+const minimumClusterSizeForRemediation = 3
+
 // ControlPlane holds business logic around control planes.
 // It should never need to connect to a service, that responsibility lies outside of this struct.
 type ControlPlane struct {
@@ -222,3 +226,24 @@ func (c *ControlPlane) NeedsReplacementNode() bool {
 func (c *ControlPlane) HasDeletingMachine() bool {
 	return len(c.Machines.Filter(machinefilters.HasDeletionTimestamp)) > 0
 }
+
+// ProvisioningMachines returns machines that are still booting. In clusters
+// with three or more machines, it excludes machines that need remediation.
+func (c *ControlPlane) ProvisioningMachines() FilterableMachineCollection {
+	machines := c.Machines.Filter(machinefilters.IsProvisioning).
+		Filter(machinefilters.Not(machinefilters.IsFailed))
+
+	if c.Machines.Len() < minimumClusterSizeForRemediation {
+		return machines
+	}
+	return machines.Filter(machinefilters.Not(machinefilters.NeedsRemediation))
+}
+
+// UnhealthyMachines returns the machines that need remediation. If the
+// cluster has fewer than three machines, it returns nil.
+func (c *ControlPlane) UnhealthyMachines() FilterableMachineCollection {
+	if c.Machines.Len() < minimumClusterSizeForRemediation {
+		return nil
+	}
+	return c.Machines.Filter(machinefilters.NeedsRemediation)
+}
diff --git a/controlplane/kubeadm/internal/control_plane_test.go b/controlplane/kubeadm/internal/control_plane_test.go
index 17adcbfd62a3..8c46f0d6b6b9 100644
--- a/controlplane/kubeadm/internal/control_plane_test.go
+++ b/controlplane/kubeadm/internal/control_plane_test.go
@@ -29,6 +29,9 @@ import (
 	clusterv1 "sigs.k8s.io/cluster-api/api/v1alpha3"
 	bootstrapv1 "sigs.k8s.io/cluster-api/bootstrap/kubeadm/api/v1alpha3"
 	controlplanev1 "sigs.k8s.io/cluster-api/controlplane/kubeadm/api/v1alpha3"
+	"sigs.k8s.io/cluster-api/controlplane/kubeadm/internal/hash"
+	capierrors "sigs.k8s.io/cluster-api/errors"
+	"sigs.k8s.io/cluster-api/util/conditions"
 )
 
 func TestControlPlane(t *testing.T) {
@@ -200,6 +203,61 @@ var _ = Describe("Control Plane", func() {
 
 })
 
+func TestMachinePhaseFilters(t *testing.T) {
+	testCases := []struct {
+		name string
+		test func(g *WithT, cp *ControlPlane)
+	}{
+		{
+			name: "machines without a node ref or ready infrastructure are provisioning",
+			test: func(g *WithT, cp *ControlPlane) {
+				cp.Machines = NewFilterableMachineCollection(
+					machine("1", withNodeRef(), withInfrastructureReady()),
+					machine("2"),
+					machine("3", withNodeRef(), withInfrastructureReady()),
+				)
+				g.Expect(cp.ProvisioningMachines().Names()).To(ConsistOf("2"))
+				g.Expect(cp.UnhealthyMachines().Names()).To(BeEmpty())
+			},
+		},
+		{
+			name: "machines with a failure message or reason are neither provisioning nor unhealthy",
+			test: func(g *WithT, cp *ControlPlane) {
+				cp.Machines = NewFilterableMachineCollection(
+					machine("1", withNodeRef(), withFailureReason("foo")),
+					machine("2", withNodeRef(), withFailureMessage("foo")),
+					machine("3", withInfrastructureReady(), withFailureReason("bar")),
+					machine("4", withInfrastructureReady(), withFailureMessage("bar")),
+					machine("5", withInfrastructureReady(), withNodeRef(), withFailureMessage("baz")),
+					machine("6", withInfrastructureReady(), withNodeRef(), withFailureReason("baz")),
+				)
+				g.Expect(cp.ProvisioningMachines().Names()).To(BeEmpty())
+				g.Expect(cp.UnhealthyMachines().Names()).To(BeEmpty())
+			},
+		},
+		{
+			name: "machines marked for remediation in clusters with at least 3 machines are unhealthy",
+			test: func(g *WithT, cp *ControlPlane) {
+				cp.Machines = NewFilterableMachineCollection(
+					machine("1", withNeedsRemediationCondition()),
+					machine("2", withNodeRef(), withInfrastructureReady()),
+					machine("3"),
+				)
+				g.Expect(cp.ProvisioningMachines().Names()).To(ConsistOf("3"))
+				g.Expect(cp.UnhealthyMachines().Names()).To(ConsistOf("1"))
+			},
+		},
+	}
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			g := NewWithT(t)
+			kcp := &controlplanev1.KubeadmControlPlane{}
+			cp := &ControlPlane{KCP: kcp}
+			tc.test(g, cp)
+		})
+	}
+}
+
 func failureDomain(controlPlane bool) clusterv1.FailureDomainSpec {
 	return clusterv1.FailureDomainSpec{
 		ControlPlane: controlPlane,
@@ -217,3 +275,45 @@ func withHash(hash string) machineOpt {
 		m.SetLabels(map[string]string{controlplanev1.KubeadmControlPlaneHashLabelKey: hash})
 	}
 }
+
+func withTimestamp(t time.Time) machineOpt {
+	return func(m *clusterv1.Machine) {
+		m.CreationTimestamp = metav1.NewTime(t)
+	}
+}
+
+func withValidHash(kcp controlplanev1.KubeadmControlPlaneSpec) machineOpt {
+	return func(m *clusterv1.Machine) {
+		withHash(hash.Compute(&kcp))(m)
+	}
+}
+
+func withNeedsRemediationCondition() machineOpt {
+	return func(m *clusterv1.Machine) {
+		conditions.MarkFalse(m, clusterv1.MachineOwnerRemediatedCondition, "some reason", "some severity", "")
+	}
+}
+
+func withNodeRef() machineOpt {
+	return func(m *clusterv1.Machine) {
+		m.Status.NodeRef = &corev1.ObjectReference{}
+	}
+}
+
+func withInfrastructureReady() machineOpt {
+	return func(m *clusterv1.Machine) {
+		m.Status.InfrastructureReady = true
+	}
+}
+
+func withFailureReason(reason string) machineOpt {
+	return func(m *clusterv1.Machine) {
+		failureReason := capierrors.MachineStatusError(reason)
+		m.Status.FailureReason = &failureReason
+	}
+}
+func withFailureMessage(msg string) machineOpt {
+	return func(m *clusterv1.Machine) {
+		m.Status.FailureMessage = pointer.StringPtr(msg)
+	}
+}
diff --git a/controlplane/kubeadm/internal/machine_collection.go b/controlplane/kubeadm/internal/machine_collection.go
index c1db81c8b192..296daa64b902 100644
--- a/controlplane/kubeadm/internal/machine_collection.go
+++ b/controlplane/kubeadm/internal/machine_collection.go
@@ -145,3 +145,13 @@ func (s FilterableMachineCollection) ConditionGetters() []conditions.Getter {
 	}
 	return res
 }
+
+// Names returns a slice of the names of each machine in the collection.
+// Useful for logging and test assertions.
+func (s FilterableMachineCollection) Names() []string {
+	names := make([]string, 0, s.Len())
+	for _, m := range s {
+		names = append(names, m.Name)
+	}
+	return names
+}
diff --git a/controlplane/kubeadm/internal/machinefilters/machine_filters.go b/controlplane/kubeadm/internal/machinefilters/machine_filters.go
index 69d4e70e6dfd..ee9a040a6394 100644
--- a/controlplane/kubeadm/internal/machinefilters/machine_filters.go
+++ b/controlplane/kubeadm/internal/machinefilters/machine_filters.go
@@ -197,3 +197,20 @@ func ControlPlaneSelectorForCluster(clusterName string) labels.Selector {
 		must(labels.NewRequirement(clusterv1.MachineControlPlaneLabelName, selection.Exists, []string{})),
 	)
 }
+
+// NeedsRemediation returns whether the machine has the
+// MachineOwnerRemediatedCondition set to false.
+func NeedsRemediation(m *clusterv1.Machine) bool {
+	return conditions.IsFalse(m, clusterv1.MachineOwnerRemediatedCondition)
+}
+
+// IsProvisioning returns whether the machine is missing its NodeRef or does
+// not have InfrastructureReady set to true.
+func IsProvisioning(m *clusterv1.Machine) bool {
+	return m.Status.NodeRef == nil || !m.Status.InfrastructureReady
+}
+
+// IsFailed returns whether the machine has a FailureMessage or a FailureReason.
+func IsFailed(m *clusterv1.Machine) bool {
+	return m.Status.FailureMessage != nil || m.Status.FailureReason != nil
+}
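
Note for reviewers, separate from the patch itself: the quorum guard introduced above only reports remediation candidates once the control plane has at least three machines. The standalone sketch below illustrates that behaviour; the machine struct, its fields, and the unhealthyMachines helper are simplified stand-ins invented for this example (the real code operates on clusterv1.Machine and FilterableMachineCollection), but the threshold check mirrors UnhealthyMachines.

package main

import "fmt"

// machine is a simplified stand-in for clusterv1.Machine; only the signal the
// new filter cares about is modelled here (an assumption for this sketch).
type machine struct {
	name                 string
	ownerRemediatedFalse bool // mirrors conditions.IsFalse(m, MachineOwnerRemediatedCondition)
}

// Remediation is skipped below three machines to avoid breaking etcd quorum,
// matching minimumClusterSizeForRemediation in the patch.
const minimumClusterSizeForRemediation = 3

// unhealthyMachines mirrors ControlPlane.UnhealthyMachines: no candidates are
// returned for control planes smaller than the minimum size.
func unhealthyMachines(machines []machine) []machine {
	if len(machines) < minimumClusterSizeForRemediation {
		return nil
	}
	var out []machine
	for _, m := range machines {
		if m.ownerRemediatedFalse {
			out = append(out, m)
		}
	}
	return out
}

func main() {
	twoNode := []machine{{name: "a", ownerRemediatedFalse: true}, {name: "b"}}
	threeNode := append(twoNode, machine{name: "c"})

	// A two-machine control plane is never remediated, even with an unhealthy member.
	fmt.Println(unhealthyMachines(twoNode)) // []
	// With three machines, "a" becomes a scale-down candidate.
	fmt.Println(unhealthyMachines(threeNode)) // [{a true}]
}

The real filters additionally exclude machines that are still provisioning or carry a failure reason/message, and selectMachineForScaleDown prefers unhealthy machines over machines that merely need a rollout.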