diff --git a/controlplane/kubeadm/api/v1alpha3/conversion.go b/controlplane/kubeadm/api/v1alpha3/conversion.go index dd0b04c5e512..d3c496e2de9b 100644 --- a/controlplane/kubeadm/api/v1alpha3/conversion.go +++ b/controlplane/kubeadm/api/v1alpha3/conversion.go @@ -85,6 +85,13 @@ func (src *KubeadmControlPlane) ConvertTo(dstRaw conversion.Hub) error { dst.Spec.RolloutBefore = restored.Spec.RolloutBefore + if restored.Spec.RemediationStrategy != nil { + dst.Spec.RemediationStrategy = restored.Spec.RemediationStrategy + } + if restored.Status.LastRemediation != nil { + dst.Status.LastRemediation = restored.Status.LastRemediation + } + return nil } diff --git a/controlplane/kubeadm/api/v1alpha3/zz_generated.conversion.go b/controlplane/kubeadm/api/v1alpha3/zz_generated.conversion.go index 94b5206584e7..ce3672490458 100644 --- a/controlplane/kubeadm/api/v1alpha3/zz_generated.conversion.go +++ b/controlplane/kubeadm/api/v1alpha3/zz_generated.conversion.go @@ -201,6 +201,7 @@ func autoConvert_v1beta1_KubeadmControlPlaneSpec_To_v1alpha3_KubeadmControlPlane // WARNING: in.RolloutBefore requires manual conversion: does not exist in peer-type // WARNING: in.RolloutAfter requires manual conversion: does not exist in peer-type out.RolloutStrategy = (*RolloutStrategy)(unsafe.Pointer(in.RolloutStrategy)) + // WARNING: in.RemediationStrategy requires manual conversion: does not exist in peer-type return nil } @@ -257,6 +258,7 @@ func autoConvert_v1beta1_KubeadmControlPlaneStatus_To_v1alpha3_KubeadmControlPla } else { out.Conditions = nil } + // WARNING: in.LastRemediation requires manual conversion: does not exist in peer-type return nil } diff --git a/controlplane/kubeadm/api/v1alpha4/conversion.go b/controlplane/kubeadm/api/v1alpha4/conversion.go index 9eaebf63b74a..6e77b8fb36d9 100644 --- a/controlplane/kubeadm/api/v1alpha4/conversion.go +++ b/controlplane/kubeadm/api/v1alpha4/conversion.go @@ -70,6 +70,13 @@ func (src *KubeadmControlPlane) ConvertTo(dstRaw conversion.Hub) error { dst.Spec.RolloutBefore = restored.Spec.RolloutBefore dst.Spec.MachineTemplate.NodeVolumeDetachTimeout = restored.Spec.MachineTemplate.NodeVolumeDetachTimeout + if restored.Spec.RemediationStrategy != nil { + dst.Spec.RemediationStrategy = restored.Spec.RemediationStrategy + } + if restored.Status.LastRemediation != nil { + dst.Status.LastRemediation = restored.Status.LastRemediation + } + return nil } @@ -234,5 +241,11 @@ func Convert_v1beta1_KubeadmControlPlaneMachineTemplate_To_v1alpha4_KubeadmContr func Convert_v1beta1_KubeadmControlPlaneSpec_To_v1alpha4_KubeadmControlPlaneSpec(in *controlplanev1.KubeadmControlPlaneSpec, out *KubeadmControlPlaneSpec, scope apiconversion.Scope) error { // .RolloutBefore was added in v1beta1. + // .RemediationStrategy was added in v1beta1. return autoConvert_v1beta1_KubeadmControlPlaneSpec_To_v1alpha4_KubeadmControlPlaneSpec(in, out, scope) } + +func Convert_v1beta1_KubeadmControlPlaneStatus_To_v1alpha4_KubeadmControlPlaneStatus(in *controlplanev1.KubeadmControlPlaneStatus, out *KubeadmControlPlaneStatus, scope apiconversion.Scope) error { + // .LastRemediation was added in v1beta1. 
+ return autoConvert_v1beta1_KubeadmControlPlaneStatus_To_v1alpha4_KubeadmControlPlaneStatus(in, out, scope) +} diff --git a/controlplane/kubeadm/api/v1alpha4/zz_generated.conversion.go b/controlplane/kubeadm/api/v1alpha4/zz_generated.conversion.go index 83cdb2e4718d..efe50a58635e 100644 --- a/controlplane/kubeadm/api/v1alpha4/zz_generated.conversion.go +++ b/controlplane/kubeadm/api/v1alpha4/zz_generated.conversion.go @@ -77,11 +77,6 @@ func RegisterConversions(s *runtime.Scheme) error { }); err != nil { return err } - if err := s.AddGeneratedConversionFunc((*v1beta1.KubeadmControlPlaneStatus)(nil), (*KubeadmControlPlaneStatus)(nil), func(a, b interface{}, scope conversion.Scope) error { - return Convert_v1beta1_KubeadmControlPlaneStatus_To_v1alpha4_KubeadmControlPlaneStatus(a.(*v1beta1.KubeadmControlPlaneStatus), b.(*KubeadmControlPlaneStatus), scope) - }); err != nil { - return err - } if err := s.AddGeneratedConversionFunc((*KubeadmControlPlaneTemplate)(nil), (*v1beta1.KubeadmControlPlaneTemplate)(nil), func(a, b interface{}, scope conversion.Scope) error { return Convert_v1alpha4_KubeadmControlPlaneTemplate_To_v1beta1_KubeadmControlPlaneTemplate(a.(*KubeadmControlPlaneTemplate), b.(*v1beta1.KubeadmControlPlaneTemplate), scope) }); err != nil { @@ -157,6 +152,11 @@ func RegisterConversions(s *runtime.Scheme) error { }); err != nil { return err } + if err := s.AddConversionFunc((*v1beta1.KubeadmControlPlaneStatus)(nil), (*KubeadmControlPlaneStatus)(nil), func(a, b interface{}, scope conversion.Scope) error { + return Convert_v1beta1_KubeadmControlPlaneStatus_To_v1alpha4_KubeadmControlPlaneStatus(a.(*v1beta1.KubeadmControlPlaneStatus), b.(*KubeadmControlPlaneStatus), scope) + }); err != nil { + return err + } if err := s.AddConversionFunc((*v1beta1.KubeadmControlPlaneTemplateResourceSpec)(nil), (*KubeadmControlPlaneSpec)(nil), func(a, b interface{}, scope conversion.Scope) error { return Convert_v1beta1_KubeadmControlPlaneTemplateResourceSpec_To_v1alpha4_KubeadmControlPlaneSpec(a.(*v1beta1.KubeadmControlPlaneTemplateResourceSpec), b.(*KubeadmControlPlaneSpec), scope) }); err != nil { @@ -295,6 +295,7 @@ func autoConvert_v1beta1_KubeadmControlPlaneSpec_To_v1alpha4_KubeadmControlPlane // WARNING: in.RolloutBefore requires manual conversion: does not exist in peer-type out.RolloutAfter = (*v1.Time)(unsafe.Pointer(in.RolloutAfter)) out.RolloutStrategy = (*RolloutStrategy)(unsafe.Pointer(in.RolloutStrategy)) + // WARNING: in.RemediationStrategy requires manual conversion: does not exist in peer-type return nil } @@ -352,14 +353,10 @@ func autoConvert_v1beta1_KubeadmControlPlaneStatus_To_v1alpha4_KubeadmControlPla } else { out.Conditions = nil } + // WARNING: in.LastRemediation requires manual conversion: does not exist in peer-type return nil } -// Convert_v1beta1_KubeadmControlPlaneStatus_To_v1alpha4_KubeadmControlPlaneStatus is an autogenerated conversion function. 
-func Convert_v1beta1_KubeadmControlPlaneStatus_To_v1alpha4_KubeadmControlPlaneStatus(in *v1beta1.KubeadmControlPlaneStatus, out *KubeadmControlPlaneStatus, s conversion.Scope) error { - return autoConvert_v1beta1_KubeadmControlPlaneStatus_To_v1alpha4_KubeadmControlPlaneStatus(in, out, s) -} - func autoConvert_v1alpha4_KubeadmControlPlaneTemplate_To_v1beta1_KubeadmControlPlaneTemplate(in *KubeadmControlPlaneTemplate, out *v1beta1.KubeadmControlPlaneTemplate, s conversion.Scope) error { out.ObjectMeta = in.ObjectMeta if err := Convert_v1alpha4_KubeadmControlPlaneTemplateSpec_To_v1beta1_KubeadmControlPlaneTemplateSpec(&in.Spec, &out.Spec, s); err != nil { diff --git a/controlplane/kubeadm/api/v1beta1/kubeadm_control_plane_types.go b/controlplane/kubeadm/api/v1beta1/kubeadm_control_plane_types.go index 823fb123f679..f4661380ccd3 100644 --- a/controlplane/kubeadm/api/v1beta1/kubeadm_control_plane_types.go +++ b/controlplane/kubeadm/api/v1beta1/kubeadm_control_plane_types.go @@ -17,6 +17,8 @@ limitations under the License. package v1beta1 import ( + "time" + corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/util/intstr" @@ -49,6 +51,23 @@ const ( // KubeadmClusterConfigurationAnnotation is a machine annotation that stores the json-marshalled string of KCP ClusterConfiguration. // This annotation is used to detect any changes in ClusterConfiguration and trigger machine rollout in KCP. KubeadmClusterConfigurationAnnotation = "controlplane.cluster.x-k8s.io/kubeadm-cluster-configuration" + + // RemediatingInProgressAnnotation is used to keep track that a KCP remediation is in progress; more + // specifically, it tracks that the system is in between having deleted an unhealthy machine and recreating its replacement. + // NOTE: if something external to CAPI removes this annotation, the system cannot detect the above situation; this can lead to + // failures in updating the remediation retry or remediation count (both counters restart from zero). + RemediatingInProgressAnnotation = "kubeadm.controlplane.cluster.x-k8s.io/remediation-in-progress" + + // MachineRemediationForAnnotation is used to link a new machine to the unhealthy machine it is replacing; + // please note that in case of retries, when the replacement machine fails as well, the system keeps track of + // the first machine of the sequence only. + // NOTE: if something external to CAPI removes this annotation, this can lead to + // failures in updating the remediation retry count (the counter restarts from zero). + MachineRemediationForAnnotation = "kubeadm.controlplane.cluster.x-k8s.io/remediation-for" + + // DefaultMinHealthyPeriod defines the default minimum period before we consider a remediation on a + // machine unrelated to the previous remediation. + DefaultMinHealthyPeriod = 1 * time.Hour ) // KubeadmControlPlaneSpec defines the desired state of KubeadmControlPlane. @@ -91,6 +110,10 @@ type KubeadmControlPlaneSpec struct { // +optional // +kubebuilder:default={type: "RollingUpdate", rollingUpdate: {maxSurge: 1}} RolloutStrategy *RolloutStrategy `json:"rolloutStrategy,omitempty"` + + // The RemediationStrategy that controls how control plane machine remediation happens.
+ // +optional + RemediationStrategy *RemediationStrategy `json:"remediationStrategy,omitempty"` } // KubeadmControlPlaneMachineTemplate defines the template for Machines @@ -158,6 +181,40 @@ type RollingUpdate struct { MaxSurge *intstr.IntOrString `json:"maxSurge,omitempty"` } +// RemediationStrategy allows defining how control plane machine remediation happens. +type RemediationStrategy struct { + // MaxRetry is the maximum number of retries while attempting to remediate an unhealthy machine. + // A retry happens when a machine that was created as a replacement for an unhealthy machine also fails. + // For example, given a control plane with three machines M1, M2, M3: + // + // M1 becomes unhealthy; remediation happens, and M1-1 is created as a replacement. + // If M1-1 (replacement of M1) has problems while bootstrapping it will become unhealthy, and then be + // remediated; this operation is considered a retry, remediation-retry #1. + // If M1-2 (replacement of M1-1) becomes unhealthy, remediation-retry #2 will happen, etc. + // + // A retry can happen only after RetryPeriod has passed since the previous retry. + // If a machine is marked as unhealthy after MinHealthyPeriod from the previous remediation has expired, + // this is no longer considered a retry because the new issue is assumed to be unrelated to the previous one. + // + // If not set, retries will be attempted indefinitely. + // +optional + MaxRetry *int32 `json:"maxRetry,omitempty"` + + // RetryPeriod is the duration that KCP should wait before remediating a machine that was created as a replacement + // for an unhealthy machine (a retry). + // + // If not set, a retry will happen immediately. + // +optional + RetryPeriod metav1.Duration `json:"retryDelaySeconds,omitempty"` + + // MinHealthyPeriod defines the duration after which KCP will consider any failure of a machine unrelated + // to the previous one, and thus a new remediation is no longer considered a retry. + // + // If not set, this value is defaulted to 1h. + // +optional + MinHealthyPeriod *metav1.Duration `json:"minHealthySeconds,omitempty"` +} + // KubeadmControlPlaneStatus defines the observed state of KubeadmControlPlane. type KubeadmControlPlaneStatus struct { // Selector is the label selector in string format to avoid introspection @@ -223,6 +280,25 @@ type KubeadmControlPlaneStatus struct { // Conditions defines current service state of the KubeadmControlPlane. // +optional Conditions clusterv1.Conditions `json:"conditions,omitempty"` + + // LastRemediation stores info about the last remediation performed. + // +optional + LastRemediation *LastRemediationStatus `json:"lastRemediation,omitempty"` +} + +// LastRemediationStatus stores info about the last remediation performed. +// NOTE: if for any reason information about the last remediation is lost, RetryCount restarts from 0 and thus +// more remediations than expected might happen. +type LastRemediationStatus struct { + // Machine is the name of the latest machine being remediated. + Machine string `json:"machine"` + + // Timestamp is the RFC 3339 date and time at which the last remediation happened. + Timestamp metav1.Timestamp `json:"timestamp"` + + // RetryCount keeps track of the number of remediation retries for the last remediated machine. + // A retry happens when a machine that was created as a replacement for an unhealthy machine also fails.
+ RetryCount int32 `json:"retryCount"` } // +kubebuilder:object:root=true diff --git a/controlplane/kubeadm/api/v1beta1/zz_generated.deepcopy.go b/controlplane/kubeadm/api/v1beta1/zz_generated.deepcopy.go index 27f9d2ecd98a..010dd455c1ea 100644 --- a/controlplane/kubeadm/api/v1beta1/zz_generated.deepcopy.go +++ b/controlplane/kubeadm/api/v1beta1/zz_generated.deepcopy.go @@ -143,6 +143,11 @@ func (in *KubeadmControlPlaneSpec) DeepCopyInto(out *KubeadmControlPlaneSpec) { *out = new(RolloutStrategy) (*in).DeepCopyInto(*out) } + if in.RemediationStrategy != nil { + in, out := &in.RemediationStrategy, &out.RemediationStrategy + *out = new(RemediationStrategy) + (*in).DeepCopyInto(*out) + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new KubeadmControlPlaneSpec. @@ -175,6 +180,11 @@ func (in *KubeadmControlPlaneStatus) DeepCopyInto(out *KubeadmControlPlaneStatus (*in)[i].DeepCopyInto(&(*out)[i]) } } + if in.LastRemediation != nil { + in, out := &in.LastRemediation, &out.LastRemediation + *out = new(LastRemediationStatus) + **out = **in + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new KubeadmControlPlaneStatus. @@ -342,6 +352,48 @@ func (in *KubeadmControlPlaneTemplateSpec) DeepCopy() *KubeadmControlPlaneTempla return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *LastRemediationStatus) DeepCopyInto(out *LastRemediationStatus) { + *out = *in + out.Timestamp = in.Timestamp +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new LastRemediationStatus. +func (in *LastRemediationStatus) DeepCopy() *LastRemediationStatus { + if in == nil { + return nil + } + out := new(LastRemediationStatus) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *RemediationStrategy) DeepCopyInto(out *RemediationStrategy) { + *out = *in + if in.MaxRetry != nil { + in, out := &in.MaxRetry, &out.MaxRetry + *out = new(int32) + **out = **in + } + out.RetryPeriod = in.RetryPeriod + if in.MinHealthyPeriod != nil { + in, out := &in.MinHealthyPeriod, &out.MinHealthyPeriod + *out = new(v1.Duration) + **out = **in + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new RemediationStrategy. +func (in *RemediationStrategy) DeepCopy() *RemediationStrategy { + if in == nil { + return nil + } + out := new(RemediationStrategy) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *RollingUpdate) DeepCopyInto(out *RollingUpdate) { *out = *in diff --git a/controlplane/kubeadm/config/crd/bases/controlplane.cluster.x-k8s.io_kubeadmcontrolplanes.yaml b/controlplane/kubeadm/config/crd/bases/controlplane.cluster.x-k8s.io_kubeadmcontrolplanes.yaml index ed855c19a8a8..6c869b11926d 100644 --- a/controlplane/kubeadm/config/crd/bases/controlplane.cluster.x-k8s.io_kubeadmcontrolplanes.yaml +++ b/controlplane/kubeadm/config/crd/bases/controlplane.cluster.x-k8s.io_kubeadmcontrolplanes.yaml @@ -3594,6 +3594,41 @@ spec: required: - infrastructureRef type: object + remediationStrategy: + description: The RemediationStrategy that controls how control plane + machines remediation happens. 
+ properties: + maxRetry: + description: "MaxRetry is the Max number of retry while attempting + to remediate an unhealthy machine. A retry happens when a machine + that was created as a replacement for an unhealthy machine also + fails. For example, given a control plane with three machines + M1, M2, M3: \n M1 become unhealthy; remediation happens, and + M1bis is created as a replacement. If M1-1 (replacement of M1) + have problems while bootstrapping it will become unhealthy, + and then be remediated; such operation is considered a retry, + remediation-retry #1. If M1-2 (replacement of M1-2) becomes + unhealthy, remediation-retry #2 will happen, etc. \n A retry + could happen only after RetryPeriod from the previous retry. + If a machine is marked as unhealthy after MinHealthyPeriod from + the previous remediation expired, this is not considered anymore + a retry because the new issue is assumed unrelated from the + previous one. \n If not set, infinite retry will be attempted." + format: int32 + type: integer + minHealthySeconds: + description: "MinHealthyPeriod defines the duration after which + KCP will consider any failure to a machine unrelated from the + previous one, and thus a new remediation is not considered a + retry anymore. \n If not set, this value is defaulted to 1h." + type: string + retryDelaySeconds: + description: "RetryPeriod is the duration that KCP should wait + before remediating a machine being created as a replacement + for an unhealthy machine (a retry). \n If not set, a retry will + happen immediately." + type: string + type: object replicas: description: Number of desired machines. Defaults to 1. When stacked etcd is used only odd numbers are permitted, as per [etcd best practice](https://etcd.io/docs/v3.3.12/faq/#why-an-odd-number-of-cluster-members). @@ -3720,6 +3755,47 @@ spec: description: Initialized denotes whether or not the control plane has the uploaded kubeadm-config configmap. type: boolean + lastRemediation: + description: LastRemediation stores info about last remediation performed. + properties: + machine: + description: Machine is the machine name of the latest machine + being remediated. + type: string + retryCount: + description: RetryCount used to keep track of remediation retry + for the last remediated machine. A retry happens when a machine + that was created as a replacement for an unhealthy machine also + fails. + format: int32 + type: integer + timestamp: + description: Timestamp is RFC 3339 date and time at which last + remediation happened. + properties: + nanos: + description: Non-negative fractions of a second at nanosecond + resolution. Negative second values with fractions must still + have non-negative nanos values that count forward in time. + Must be from 0 to 999,999,999 inclusive. This field may + be limited in precision depending on context. + format: int32 + type: integer + seconds: + description: Represents seconds of UTC time since Unix epoch + 1970-01-01T00:00:00Z. Must be from 0001-01-01T00:00:00Z + to 9999-12-31T23:59:59Z inclusive. + format: int64 + type: integer + required: + - nanos + - seconds + type: object + required: + - machine + - retryCount + - timestamp + type: object observedGeneration: description: ObservedGeneration is the latest generation observed by the controller. 
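Illustrative example (editorial addition, not part of the patch): a minimal sketch of how the remediationStrategy surface defined above might be configured, using the RemediationStrategy type added in this PR and the k8s.io/utils/pointer helper already used in the tests below; the concrete values (3 retries, 5m retry period, 2h min healthy period) are arbitrary assumptions.

package example

import (
	"time"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	utilpointer "k8s.io/utils/pointer"

	controlplanev1 "sigs.k8s.io/cluster-api/controlplane/kubeadm/api/v1beta1"
)

// remediationStrategy retries a failed replacement of the same unhealthy machine at most 3 times,
// waits 5 minutes between consecutive retries, and treats a failure happening more than 2 hours
// after the previous remediation as a new, unrelated problem (resetting the retry sequence).
var remediationStrategy = &controlplanev1.RemediationStrategy{
	MaxRetry:         utilpointer.Int32(3),
	RetryPeriod:      metav1.Duration{Duration: 5 * time.Minute},
	MinHealthyPeriod: &metav1.Duration{Duration: 2 * time.Hour},
}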
diff --git a/controlplane/kubeadm/internal/controllers/helpers.go b/controlplane/kubeadm/internal/controllers/helpers.go index 30bfba3058a5..02fbd7bf240e 100644 --- a/controlplane/kubeadm/internal/controllers/helpers.go +++ b/controlplane/kubeadm/internal/controllers/helpers.go @@ -315,6 +315,13 @@ func (r *KubeadmControlPlaneReconciler) generateMachine(ctx context.Context, kcp machine.Spec.NodeDeletionTimeout = kcp.Spec.MachineTemplate.NodeDeletionTimeout } + // In case this machine is being created as a consequence of a remediation, then add an annotation + // tracking the name of the machine we are remediating for. + // NOTE: This is required in order to track remediation retries. + if v, ok := kcp.Annotations[controlplanev1.RemediatingInProgressAnnotation]; ok && v == "true" { + machine.Annotations[controlplanev1.MachineRemediationForAnnotation] = kcp.Status.LastRemediation.Machine + } + // Machine's bootstrap config may be missing ClusterConfiguration if it is not the first machine in the control plane. // We store ClusterConfiguration as annotation here to detect any changes in KCP ClusterConfiguration and rollout the machine if any. clusterConfig, err := json.Marshal(kcp.Spec.KubeadmConfigSpec.ClusterConfiguration) @@ -332,5 +339,10 @@ func (r *KubeadmControlPlaneReconciler) generateMachine(ctx context.Context, kcp if err := r.Client.Create(ctx, machine); err != nil { return errors.Wrap(err, "failed to create machine") } + + // Remove the annotation tracking that a remediation is in progress (the remediation completed when + // the replacement machine have been created above). + delete(kcp.Annotations, controlplanev1.RemediatingInProgressAnnotation) + return nil } diff --git a/controlplane/kubeadm/internal/controllers/remediation.go b/controlplane/kubeadm/internal/controllers/remediation.go index 429f8a2e954a..c7341b72ef49 100644 --- a/controlplane/kubeadm/internal/controllers/remediation.go +++ b/controlplane/kubeadm/internal/controllers/remediation.go @@ -19,9 +19,12 @@ package controllers import ( "context" "fmt" + "time" "github.com/blang/semver" + "github.com/go-logr/logr" "github.com/pkg/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" kerrors "k8s.io/apimachinery/pkg/util/errors" "k8s.io/klog/v2" ctrl "sigs.k8s.io/controller-runtime" @@ -31,6 +34,7 @@ import ( controlplanev1 "sigs.k8s.io/cluster-api/controlplane/kubeadm/api/v1beta1" "sigs.k8s.io/cluster-api/controlplane/kubeadm/internal" "sigs.k8s.io/cluster-api/util" + "sigs.k8s.io/cluster-api/util/annotations" "sigs.k8s.io/cluster-api/util/conditions" "sigs.k8s.io/cluster-api/util/patch" ) @@ -67,6 +71,11 @@ func (r *KubeadmControlPlaneReconciler) reconcileUnhealthyMachines(ctx context.C return ctrl.Result{}, kerrors.NewAggregate(errList) } + // Returns if another remediation is in progress but the new machine is not yet created. + if _, ok := controlPlane.KCP.Annotations[controlplanev1.RemediatingInProgressAnnotation]; ok { + return ctrl.Result{}, nil + } + // Gets all machines that have `MachineHealthCheckSucceeded=False` (indicating a problem was detected on the machine) // and `MachineOwnerRemediated` present, indicating that this controller is responsible for performing remediation. unhealthyMachines := controlPlane.UnhealthyMachines() @@ -107,81 +116,92 @@ func (r *KubeadmControlPlaneReconciler) reconcileUnhealthyMachines(ctx context.C // Before starting remediation, run preflight checks in order to verify it is safe to remediate. 
// If any of the following checks fails, we'll surface the reason in the MachineOwnerRemediated condition. - log.WithValues("Machine", klog.KObj(machineToBeRemediated)) - desiredReplicas := int(*controlPlane.KCP.Spec.Replicas) - - // The cluster MUST have more than one replica, because this is the smallest cluster size that allows any etcd failure tolerance. - if controlPlane.Machines.Len() <= 1 { - log.Info("A control plane machine needs remediation, but the number of current replicas is less or equal to 1. Skipping remediation", "Replicas", controlPlane.Machines.Len()) - conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate if current replicas are less or equal to 1") - return ctrl.Result{}, nil - } - - // The number of replicas MUST be equal to or greater than the desired replicas. This rule ensures that when the cluster - // is missing replicas, we skip remediation and instead perform regular scale up/rollout operations first. - if controlPlane.Machines.Len() < desiredReplicas { - log.Info("A control plane machine needs remediation, but the current number of replicas is lower that expected. Skipping remediation", "Replicas", desiredReplicas, "CurrentReplicas", controlPlane.Machines.Len()) - conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP waiting for having at least %d control plane machines before triggering remediation", desiredReplicas) - return ctrl.Result{}, nil - } + log.WithValues("Machine", klog.KObj(machineToBeRemediated), "Initialized", controlPlane.KCP.Status.Initialized) - // The cluster MUST have no machines with a deletion timestamp. This rule prevents KCP taking actions while the cluster is in a transitional state. - if controlPlane.HasDeletingMachine() { - log.Info("A control plane machine needs remediation, but there are other control-plane machines being deleted. Skipping remediation") - conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP waiting for control plane machine deletion to complete before triggering remediation") + // Check if KCP is allowed to remediate considering retry limits: + // - Remediation cannot happen because retryPeriod is not yet expired. + // - KCP already reached the MaxRetry limit. + hasRetries, retryCount := r.checkRetryLimits(log, machineToBeRemediated, controlPlane) + if !hasRetries { return ctrl.Result{}, nil } - // Remediation MUST preserve etcd quorum. This rule ensures that we will not remove a member that would result in etcd - // losing a majority of members and thus become unable to field new requests. - if controlPlane.IsEtcdManaged() { - canSafelyRemediate, err := r.canSafelyRemoveEtcdMember(ctx, controlPlane, machineToBeRemediated) - if err != nil { - conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.RemediationFailedReason, clusterv1.ConditionSeverityError, err.Error()) - return ctrl.Result{}, err + // Execute checks that apply only if the control plane is already initialized; in this case KCP can + // remediate only if it can safely assume that the operation preserves the operational state of the existing cluster (or at least does not make it worse).
+ if controlPlane.KCP.Status.Initialized { + // The cluster MUST have more than one replica, because this is the smallest cluster size that allows any etcd failure tolerance. + if controlPlane.Machines.Len() <= 1 { + log.Info("A control plane machine needs remediation, but the number of current replicas is less or equal to 1. Skipping remediation", "Replicas", controlPlane.Machines.Len()) + conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate if current replicas are less or equal to 1") + return ctrl.Result{}, nil } - if !canSafelyRemediate { - log.Info("A control plane machine needs remediation, but removing this machine could result in etcd quorum loss. Skipping remediation") - conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate this machine because this could result in etcd loosing quorum") + + // The cluster MUST have no machines with a deletion timestamp. This rule prevents KCP taking actions while the cluster is in a transitional state. + if controlPlane.HasDeletingMachine() { + log.Info("A control plane machine needs remediation, but there are other control-plane machines being deleted. Skipping remediation") + conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP waiting for control plane machine deletion to complete before triggering remediation") return ctrl.Result{}, nil } - } - workloadCluster, err := r.managementCluster.GetWorkloadCluster(ctx, util.ObjectKey(controlPlane.Cluster)) - if err != nil { - log.Error(err, "Failed to create client to workload cluster") - return ctrl.Result{}, errors.Wrapf(err, "failed to create client to workload cluster") + // Remediation MUST preserve etcd quorum. This rule ensures that KCP will not remove a member that would result in etcd + // losing a majority of members and thus become unable to field new requests. + if controlPlane.IsEtcdManaged() { + canSafelyRemediate, err := r.canSafelyRemoveEtcdMember(ctx, controlPlane, machineToBeRemediated) + if err != nil { + conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.RemediationFailedReason, clusterv1.ConditionSeverityError, err.Error()) + return ctrl.Result{}, err + } + if !canSafelyRemediate { + log.Info("A control plane machine needs remediation, but removing this machine could result in etcd quorum loss. Skipping remediation") + conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate this machine because this could result in etcd loosing quorum") + return ctrl.Result{}, nil + } + } } - // If the machine that is about to be deleted is the etcd leader, move it to the newest member available. 
- if controlPlane.IsEtcdManaged() { - etcdLeaderCandidate := controlPlane.HealthyMachines().Newest() - if etcdLeaderCandidate == nil { - log.Info("A control plane machine needs remediation, but there is no healthy machine to forward etcd leadership to") - conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.RemediationFailedReason, clusterv1.ConditionSeverityWarning, - "A control plane machine needs remediation, but there is no healthy machine to forward etcd leadership to. Skipping remediation") - return ctrl.Result{}, nil - } - if err := workloadCluster.ForwardEtcdLeadership(ctx, machineToBeRemediated, etcdLeaderCandidate); err != nil { - log.Error(err, "Failed to move etcd leadership to candidate machine", "candidate", klog.KObj(etcdLeaderCandidate)) - conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.RemediationFailedReason, clusterv1.ConditionSeverityError, err.Error()) - return ctrl.Result{}, err + // Remediate the unhealthy control plane machine by deleting it. + + // If the control plane is initialized, before deleting the machine: + // - if the machine hosts the etcd leader, forward etcd leadership to another machine. + // - delete the etcd member hosted on the machine being deleted. + // - remove the etcd member from the kubeadm config map (only for kubernetes version older than v1.22.0) + if controlPlane.KCP.Status.Initialized { + workloadCluster, err := r.managementCluster.GetWorkloadCluster(ctx, util.ObjectKey(controlPlane.Cluster)) + if err != nil { + log.Error(err, "Failed to create client to workload cluster") + return ctrl.Result{}, errors.Wrapf(err, "failed to create client to workload cluster") } - if err := workloadCluster.RemoveEtcdMemberForMachine(ctx, machineToBeRemediated); err != nil { - log.Error(err, "Failed to remove etcd member for machine") - conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.RemediationFailedReason, clusterv1.ConditionSeverityError, err.Error()) - return ctrl.Result{}, err + + // If the machine that is about to be deleted is the etcd leader, move it to the newest member available. + if controlPlane.IsEtcdManaged() { + etcdLeaderCandidate := controlPlane.HealthyMachines().Newest() + if etcdLeaderCandidate == nil { + log.Info("A control plane machine needs remediation, but there is no healthy machine to forward etcd leadership to") + conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.RemediationFailedReason, clusterv1.ConditionSeverityWarning, + "A control plane machine needs remediation, but there is no healthy machine to forward etcd leadership to. 
Skipping remediation") + return ctrl.Result{}, nil + } + if err := workloadCluster.ForwardEtcdLeadership(ctx, machineToBeRemediated, etcdLeaderCandidate); err != nil { + log.Error(err, "Failed to move etcd leadership to candidate machine", "candidate", klog.KObj(etcdLeaderCandidate)) + conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.RemediationFailedReason, clusterv1.ConditionSeverityError, err.Error()) + return ctrl.Result{}, err + } + if err := workloadCluster.RemoveEtcdMemberForMachine(ctx, machineToBeRemediated); err != nil { + log.Error(err, "Failed to remove etcd member for machine") + conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.RemediationFailedReason, clusterv1.ConditionSeverityError, err.Error()) + return ctrl.Result{}, err + } + } - parsedVersion, err := semver.ParseTolerant(controlPlane.KCP.Spec.Version) - if err != nil { - return ctrl.Result{}, errors.Wrapf(err, "failed to parse kubernetes version %q", controlPlane.KCP.Spec.Version) - } + parsedVersion, err := semver.ParseTolerant(controlPlane.KCP.Spec.Version) + if err != nil { + return ctrl.Result{}, errors.Wrapf(err, "failed to parse kubernetes version %q", controlPlane.KCP.Spec.Version) + } - if err := workloadCluster.RemoveMachineFromKubeadmConfigMap(ctx, machineToBeRemediated, parsedVersion); err != nil { - log.Error(err, "Failed to remove machine from kubeadm ConfigMap") - return ctrl.Result{}, err + if err := workloadCluster.RemoveMachineFromKubeadmConfigMap(ctx, machineToBeRemediated, parsedVersion); err != nil { + log.Error(err, "Failed to remove machine from kubeadm ConfigMap") + return ctrl.Result{}, err + } } if err := r.Client.Delete(ctx, machineToBeRemediated); err != nil { @@ -191,9 +211,107 @@ func (r *KubeadmControlPlaneReconciler) reconcileUnhealthyMachines(ctx context.C log.Info("Remediating unhealthy machine") conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "") + + // Set the annotation tracking that a remediation is in progress (the remediation will complete when a replacement machine is created). + annotations.AddAnnotations(controlPlane.KCP, map[string]string{ + controlplanev1.RemediatingInProgressAnnotation: "", + }) + + // Store info about the last remediation. + // NOTE: Some of this info has been computed above, but it must surface on the object only here, after the machine has been deleted. + controlPlane.KCP.Status.LastRemediation = &controlplanev1.LastRemediationStatus{ + Machine: machineToBeRemediated.Name, + Timestamp: metav1.Timestamp{Seconds: time.Now().Unix()}, + RetryCount: retryCount, + } + return ctrl.Result{Requeue: true}, nil } +// checkRetryLimits checks if KCP is allowed to remediate considering retry limits: +// - Remediation cannot happen because retryDelay is not yet expired. +// - KCP already reached the maximum number of retries for a machine. +// NOTE: Counting the number of retries is required in order to prevent infinite remediation, e.g. in case the +// first control plane machine fails due to a quota issue. +func (r *KubeadmControlPlaneReconciler) checkRetryLimits(log logr.Logger, machineToBeRemediated *clusterv1.Machine, controlPlane *internal.ControlPlane) (hasRetries bool, retryCount int32) { + // If there is no last remediation, this is the first try of a new retry sequence.
+ if controlPlane.KCP.Status.LastRemediation == nil { + return true, 0 + } + + // Get MinHealthyPeriod and RetryPeriod from the remediation strategy, or use defaults. + minHealthyPeriod := controlplanev1.DefaultMinHealthyPeriod + if controlPlane.KCP.Spec.RemediationStrategy != nil && controlPlane.KCP.Spec.RemediationStrategy.MinHealthyPeriod != nil { + minHealthyPeriod = controlPlane.KCP.Spec.RemediationStrategy.MinHealthyPeriod.Duration + } + + retryDelay := time.Duration(0) + if controlPlane.KCP.Spec.RemediationStrategy != nil { + retryDelay = controlPlane.KCP.Spec.RemediationStrategy.RetryPeriod.Duration + } + + // Get the timestamp of the last remediation; if missing, default to a value + // that ensures both MinHealthyPeriod and RetryPeriod are expired. + // NOTE: this could potentially lead to executing more retries than expected, or to executing retries earlier than + // expected, but this is considered acceptable when the system recovers from someone/something manually removing the last remediation info from the KCP status. + max := func(x, y time.Duration) time.Duration { + if x < y { + return y + } + return x + } + + lastRemediationTimestamp := time.Now().Add(-2 * max(minHealthyPeriod, retryDelay)).UTC() + if controlPlane.KCP.Status.LastRemediation != nil { + lastRemediationTimestamp = time.Unix(controlPlane.KCP.Status.LastRemediation.Timestamp.Seconds, int64(controlPlane.KCP.Status.LastRemediation.Timestamp.Nanos)) + } + + // Check if the machine being remediated has been created as a remediation for a previous unhealthy machine. + // NOTE: if someone/something manually removes the MachineRemediationForAnnotation on Machines, or the LastRemediation info + // from the KCP status, this could potentially lead to executing more retries than expected, + // but this is considered acceptable in such a case. + machineRemediatingFor := machineToBeRemediated.Name + if remediationFor, ok := machineToBeRemediated.Annotations[controlplanev1.MachineRemediationForAnnotation]; ok { + // If the remediation is happening before minHealthyPeriod has expired, then KCP considers this + // as a remediation for the same previously unhealthy machine. + // TODO: add example + if lastRemediationTimestamp.Add(minHealthyPeriod).After(time.Now()) { + machineRemediatingFor = remediationFor + log.WithValues("RemediationRetryFor", klog.KRef(machineToBeRemediated.Namespace, machineRemediatingFor)) + } + } + + // If remediation is happening for a different machine, this is the first try of a new retry sequence. + if controlPlane.KCP.Status.LastRemediation.Machine != machineRemediatingFor { + return true, 0 + } + + // Check if remediation can happen because retryDelay has passed. + if lastRemediationTimestamp.Add(retryDelay).After(time.Now().UTC()) { + log.Info(fmt.Sprintf("A control plane machine needs remediation, but the operation already failed in the latest %s. Skipping remediation", retryDelay)) + conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate this machine because the operation already failed in the latest %s (RetryDelay)", retryDelay) + return false, 0 + } + + // Check if remediation can happen because maxRetry is not reached yet, if defined.
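To make the retry accounting around this point concrete, here is a hedged, self-contained sketch (editorial addition, not part of the patch) of the two limits checkRetryLimits enforces once the retry is known to target the same machine: the retryDelay guard above and the maxRetry guard that follows. The helper name and parameters are invented for illustration only.

package example

import "time"

// retryAllowed rejects a retry while retryPeriod has not yet passed since the last remediation,
// and rejects it once the previous remediation already consumed maxRetry retries
// (a nil maxRetry means retries are unlimited).
func retryAllowed(lastRemediation time.Time, retryCount int32, retryPeriod time.Duration, maxRetry *int32, now time.Time) bool {
	if lastRemediation.Add(retryPeriod).After(now) {
		return false // RetryPeriod not yet expired.
	}
	if maxRetry != nil && retryCount >= *maxRetry {
		return false // MaxRetry already reached.
	}
	return true
}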
+ retry := controlPlane.KCP.Status.LastRemediation.RetryCount + + if controlPlane.KCP.Spec.RemediationStrategy != nil && controlPlane.KCP.Spec.RemediationStrategy.MaxRetry != nil { + maxRetry := *controlPlane.KCP.Spec.RemediationStrategy.MaxRetry + if retry >= maxRetry { + log.Info(fmt.Sprintf("A control plane machine needs remediation, but the operation already failed %d times (MaxRetry %d). Skipping remediation", retry, maxRetry)) + conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate this machine because the operation already failed %d times (MaxRetry)", maxRetry) + return false, 0 + } + } + + retryCount = retry + 1 + + log.WithValues("RetryCount", retryCount) + return true, retryCount +} + // canSafelyRemoveEtcdMember assess if it is possible to remove the member hosted on the machine to be remediated // without loosing etcd quorum. // @@ -208,7 +326,7 @@ func (r *KubeadmControlPlaneReconciler) reconcileUnhealthyMachines(ctx context.C // - etc. // // NOTE: this func assumes the list of members in sync with the list of machines/nodes, it is required to call reconcileEtcdMembers -// ans well as reconcileControlPlaneConditions before this. +// as well as reconcileControlPlaneConditions before this. func (r *KubeadmControlPlaneReconciler) canSafelyRemoveEtcdMember(ctx context.Context, controlPlane *internal.ControlPlane, machineToBeRemediated *clusterv1.Machine) (bool, error) { log := ctrl.LoggerFrom(ctx) @@ -258,10 +376,10 @@ func (r *KubeadmControlPlaneReconciler) canSafelyRemoveEtcdMember(ctx context.Co } } - // If an etcd member does not have a corresponding machine, it is not possible to retrieve etcd member health - // so we are assuming the worst scenario and considering the member unhealthy. + // If an etcd member does not have a corresponding machine it is not possible to retrieve etcd member health, + // so KCP is assuming the worst scenario and considering the member unhealthy. // - // NOTE: This should not happen given that we are running reconcileEtcdMembers before calling this method. + // NOTE: This should not happen given that KCP is running reconcileEtcdMembers before calling this method. 
if machine == nil { log.Info("An etcd member does not have a corresponding machine, assuming this member is unhealthy", "MemberName", etcdMember) targetUnhealthyMembers++ diff --git a/controlplane/kubeadm/internal/controllers/remediation_test.go b/controlplane/kubeadm/internal/controllers/remediation_test.go index e6c13e1691b3..9ce9e72acf0a 100644 --- a/controlplane/kubeadm/internal/controllers/remediation_test.go +++ b/controlplane/kubeadm/internal/controllers/remediation_test.go @@ -42,7 +42,7 @@ import ( func TestReconcileUnhealthyMachines(t *testing.T) { g := NewWithT(t) - ctx := context.TODO() + r := &KubeadmControlPlaneReconciler{ Client: env.GetClient(), recorder: record.NewFakeRecorder(32), @@ -53,7 +53,14 @@ func TestReconcileUnhealthyMachines(t *testing.T) { g.Expect(env.Cleanup(ctx, ns)).To(Succeed()) }() - t.Run("Remediation cleans up stuck remediation on previously unhealthy machines", func(t *testing.T) { + var removeFinalizer = func(g *WithT, m *clusterv1.Machine) { + patchHelper, err := patch.NewHelper(m, env.GetClient()) + g.Expect(err).ToNot(HaveOccurred()) + m.ObjectMeta.Finalizers = nil + g.Expect(patchHelper.Patch(ctx, m)) + } + + t.Run("It cleans up stuck remediation on previously unhealthy machines", func(t *testing.T) { g := NewWithT(t) m := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withStuckRemediation()) @@ -63,7 +70,7 @@ func TestReconcileUnhealthyMachines(t *testing.T) { Cluster: &clusterv1.Cluster{}, Machines: collections.FromMachines(m), } - ret, err := r.reconcileUnhealthyMachines(context.TODO(), controlPlane) + ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane) g.Expect(ret.IsZero()).To(BeTrue()) // Remediation skipped g.Expect(err).ToNot(HaveOccurred()) @@ -79,6 +86,10 @@ func TestReconcileUnhealthyMachines(t *testing.T) { return errors.Errorf("condition %s still exists", clusterv1.MachineOwnerRemediatedCondition) }, 10*time.Second).Should(Succeed()) }) + + // Generic preflight checks + // Those are ore flight checks that happen no matter if the control plane has been already initialized or not. 
+ t.Run("Remediation does not happen if there are no unhealthy machines", func(t *testing.T) { g := NewWithT(t) @@ -87,12 +98,34 @@ func TestReconcileUnhealthyMachines(t *testing.T) { Cluster: &clusterv1.Cluster{}, Machines: collections.New(), } - ret, err := r.reconcileUnhealthyMachines(context.TODO(), controlPlane) + ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane) g.Expect(ret.IsZero()).To(BeTrue()) // Remediation skipped g.Expect(err).ToNot(HaveOccurred()) }) - t.Run("reconcileUnhealthyMachines return early if the machine to be remediated is marked for deletion", func(t *testing.T) { + t.Run("reconcileUnhealthyMachines return early if another remediation is in progress", func(t *testing.T) { + g := NewWithT(t) + + m := getDeletingMachine(ns.Name, "m1-unhealthy-deleting-", withMachineHealthCheckFailed()) + conditions.MarkFalse(m, clusterv1.MachineHealthCheckSucceededCondition, clusterv1.MachineHasFailureReason, clusterv1.ConditionSeverityWarning, "") + conditions.MarkFalse(m, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "") + controlPlane := &internal.ControlPlane{ + KCP: &controlplanev1.KubeadmControlPlane{ + ObjectMeta: metav1.ObjectMeta{ + Annotations: map[string]string{ + controlplanev1.RemediatingInProgressAnnotation: "true", + }, + }, + }, + Cluster: &clusterv1.Cluster{}, + Machines: collections.FromMachines(m), + } + ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane) + + g.Expect(ret.IsZero()).To(BeTrue()) // Remediation skipped + g.Expect(err).ToNot(HaveOccurred()) + }) + t.Run("reconcileUnhealthyMachines return early if the machine to be remediated is already being deleted", func(t *testing.T) { g := NewWithT(t) m := getDeletingMachine(ns.Name, "m1-unhealthy-deleting-", withMachineHealthCheckFailed()) @@ -103,92 +136,655 @@ func TestReconcileUnhealthyMachines(t *testing.T) { Cluster: &clusterv1.Cluster{}, Machines: collections.FromMachines(m), } - ret, err := r.reconcileUnhealthyMachines(context.TODO(), controlPlane) + ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane) g.Expect(ret.IsZero()).To(BeTrue()) // Remediation skipped g.Expect(err).ToNot(HaveOccurred()) }) + t.Run("Remediation does not happen if MaxRetry is reached", func(t *testing.T) { + g := NewWithT(t) + + m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer(), withRemediateForAnnotation("m0")) + m2 := createMachine(ctx, g, ns.Name, "m2-healthy-", withHealthyEtcdMember()) + m3 := createMachine(ctx, g, ns.Name, "m3-healthy-", withHealthyEtcdMember()) + + controlPlane := &internal.ControlPlane{ + KCP: &controlplanev1.KubeadmControlPlane{ + Spec: controlplanev1.KubeadmControlPlaneSpec{ + Replicas: utilpointer.Int32(3), + Version: "v1.19.1", + RemediationStrategy: &controlplanev1.RemediationStrategy{ + MaxRetry: utilpointer.Int32(3), + }, + }, + Status: controlplanev1.KubeadmControlPlaneStatus{ + Initialized: true, + LastRemediation: &controlplanev1.LastRemediationStatus{ + Machine: "m0", + Timestamp: metav1.Timestamp{Seconds: time.Now().Add(-controlplanev1.DefaultMinHealthyPeriod / 2).Unix()}, // minHealthy not expired yet. 
+ RetryCount: 3, + }, + }, + }, + Cluster: &clusterv1.Cluster{}, + Machines: collections.FromMachines(m1, m2, m3), + } + + r := &KubeadmControlPlaneReconciler{ + Client: env.GetClient(), + recorder: record.NewFakeRecorder(32), + managementCluster: &fakeManagementCluster{ + Workload: fakeWorkloadCluster{ + EtcdMembersResult: nodes(controlPlane.Machines), + }, + }, + } + + ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane) + + g.Expect(ret.IsZero()).To(BeTrue()) // Remediation skipped + g.Expect(err).ToNot(HaveOccurred()) + + g.Expect(controlPlane.KCP.Annotations).ToNot(HaveKey(controlplanev1.RemediatingInProgressAnnotation)) + g.Expect(controlPlane.KCP.Status.LastRemediation.RetryCount).To(Equal(int32(3))) + g.Expect(controlPlane.KCP.Status.LastRemediation.Machine).To(Equal("m0")) + + assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate this machine because the operation already failed 3 times (MaxRetry)") + + err = env.Get(ctx, client.ObjectKey{Namespace: m1.Namespace, Name: m1.Name}, m1) + g.Expect(err).ToNot(HaveOccurred()) + g.Expect(m1.ObjectMeta.DeletionTimestamp.IsZero()).To(BeTrue()) + + removeFinalizer(g, m1) + g.Expect(env.Cleanup(ctx, m1, m2, m3)).To(Succeed()) + }) + t.Run("Retry history is ignored if min healthy period is expired, default min healthy period", func(t *testing.T) { + g := NewWithT(t) + + m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer(), withRemediateForAnnotation("m0")) + m2 := createMachine(ctx, g, ns.Name, "m2-healthy-", withHealthyEtcdMember()) + m3 := createMachine(ctx, g, ns.Name, "m3-healthy-", withHealthyEtcdMember()) + + controlPlane := &internal.ControlPlane{ + KCP: &controlplanev1.KubeadmControlPlane{ + Spec: controlplanev1.KubeadmControlPlaneSpec{ + Replicas: utilpointer.Int32(3), + Version: "v1.19.1", + RemediationStrategy: &controlplanev1.RemediationStrategy{ + MaxRetry: utilpointer.Int32(3), + }, + }, + Status: controlplanev1.KubeadmControlPlaneStatus{ + Initialized: true, + LastRemediation: &controlplanev1.LastRemediationStatus{ + Machine: "m0", + Timestamp: metav1.Timestamp{Seconds: time.Now().Add(-2 * controlplanev1.DefaultMinHealthyPeriod).Unix()}, // minHealthyPeriod already expired. 
+ RetryCount: 3, + }, + }, + }, + Cluster: &clusterv1.Cluster{}, + Machines: collections.FromMachines(m1, m2, m3), + } + + r := &KubeadmControlPlaneReconciler{ + Client: env.GetClient(), + recorder: record.NewFakeRecorder(32), + managementCluster: &fakeManagementCluster{ + Workload: fakeWorkloadCluster{ + EtcdMembersResult: nodes(controlPlane.Machines), + }, + }, + } + + ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane) + + g.Expect(ret.IsZero()).To(BeFalse()) // Remediation completed, requeue + g.Expect(err).ToNot(HaveOccurred()) + + g.Expect(controlPlane.KCP.Annotations).To(HaveKey(controlplanev1.RemediatingInProgressAnnotation)) + g.Expect(controlPlane.KCP.Status.LastRemediation.RetryCount).To(Equal(int32(0))) + g.Expect(controlPlane.KCP.Status.LastRemediation.Machine).To(Equal(m1.Name)) + + assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "") + + err = env.Get(ctx, client.ObjectKey{Namespace: m1.Namespace, Name: m1.Name}, m1) + g.Expect(err).ToNot(HaveOccurred()) + g.Expect(m1.ObjectMeta.DeletionTimestamp.IsZero()).To(BeFalse()) + + removeFinalizer(g, m1) + g.Expect(env.Cleanup(ctx, m1, m2, m3)).To(Succeed()) + }) + t.Run("Retry history is ignored if min healthy period is expired", func(t *testing.T) { + g := NewWithT(t) + + m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer(), withRemediateForAnnotation("m0")) + m2 := createMachine(ctx, g, ns.Name, "m2-healthy-", withHealthyEtcdMember()) + m3 := createMachine(ctx, g, ns.Name, "m3-healthy-", withHealthyEtcdMember()) + + minHealthyPeriod := 4 * controlplanev1.DefaultMinHealthyPeriod // big min healthy period, so we are user that we are not using DefaultMinHealthyPeriod. + + controlPlane := &internal.ControlPlane{ + KCP: &controlplanev1.KubeadmControlPlane{ + Spec: controlplanev1.KubeadmControlPlaneSpec{ + Replicas: utilpointer.Int32(3), + Version: "v1.19.1", + RemediationStrategy: &controlplanev1.RemediationStrategy{ + MaxRetry: utilpointer.Int32(3), + MinHealthyPeriod: &metav1.Duration{Duration: minHealthyPeriod}, + }, + }, + Status: controlplanev1.KubeadmControlPlaneStatus{ + Initialized: true, + LastRemediation: &controlplanev1.LastRemediationStatus{ + Machine: "m0", + Timestamp: metav1.Timestamp{Seconds: time.Now().Add(-2 * minHealthyPeriod).Unix()}, // minHealthyPeriod already expired. 
+ RetryCount: 3, + }, + }, + }, + Cluster: &clusterv1.Cluster{}, + Machines: collections.FromMachines(m1, m2, m3), + } + + r := &KubeadmControlPlaneReconciler{ + Client: env.GetClient(), + recorder: record.NewFakeRecorder(32), + managementCluster: &fakeManagementCluster{ + Workload: fakeWorkloadCluster{ + EtcdMembersResult: nodes(controlPlane.Machines), + }, + }, + } + + ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane) + + g.Expect(ret.IsZero()).To(BeFalse()) // Remediation completed, requeue + g.Expect(err).ToNot(HaveOccurred()) + + g.Expect(controlPlane.KCP.Annotations).To(HaveKey(controlplanev1.RemediatingInProgressAnnotation)) + g.Expect(controlPlane.KCP.Status.LastRemediation.RetryCount).To(Equal(int32(0))) + g.Expect(controlPlane.KCP.Status.LastRemediation.Machine).To(Equal(m1.Name)) + + assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "") + + err = env.Get(ctx, client.ObjectKey{Namespace: m1.Namespace, Name: m1.Name}, m1) + g.Expect(err).ToNot(HaveOccurred()) + g.Expect(m1.ObjectMeta.DeletionTimestamp.IsZero()).To(BeFalse()) + + removeFinalizer(g, m1) + g.Expect(env.Cleanup(ctx, m1, m2, m3)).To(Succeed()) + }) + t.Run("Remediation does not happen if RetryDelay is not yet passed", func(t *testing.T) { + g := NewWithT(t) + + m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer(), withRemediateForAnnotation("m0")) + m2 := createMachine(ctx, g, ns.Name, "m2-healthy-", withHealthyEtcdMember()) + m3 := createMachine(ctx, g, ns.Name, "m3-healthy-", withHealthyEtcdMember()) + + controlPlane := &internal.ControlPlane{ + KCP: &controlplanev1.KubeadmControlPlane{ + Spec: controlplanev1.KubeadmControlPlaneSpec{ + Replicas: utilpointer.Int32(3), + Version: "v1.19.1", + RemediationStrategy: &controlplanev1.RemediationStrategy{ + MaxRetry: utilpointer.Int32(3), + RetryPeriod: metav1.Duration{Duration: controlplanev1.DefaultMinHealthyPeriod}, // RetryDelaySeconds not yet expired. + }, + }, + Status: controlplanev1.KubeadmControlPlaneStatus{ + Initialized: true, + LastRemediation: &controlplanev1.LastRemediationStatus{ + Machine: "m0", + Timestamp: metav1.Timestamp{Seconds: time.Now().Add(-controlplanev1.DefaultMinHealthyPeriod / 2).Unix()}, // minHealthyPeriod not yet expired. 
+ RetryCount: 2, + }, + }, + }, + Cluster: &clusterv1.Cluster{}, + Machines: collections.FromMachines(m1, m2, m3), + } + + r := &KubeadmControlPlaneReconciler{ + Client: env.GetClient(), + recorder: record.NewFakeRecorder(32), + managementCluster: &fakeManagementCluster{ + Workload: fakeWorkloadCluster{ + EtcdMembersResult: nodes(controlPlane.Machines), + }, + }, + } + + ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane) + + g.Expect(ret.IsZero()).To(BeTrue()) // Remediation skipped + g.Expect(err).ToNot(HaveOccurred()) + + g.Expect(controlPlane.KCP.Annotations).ToNot(HaveKey(controlplanev1.RemediatingInProgressAnnotation)) + g.Expect(controlPlane.KCP.Status.LastRemediation.RetryCount).To(Equal(int32(2))) + g.Expect(controlPlane.KCP.Status.LastRemediation.Machine).To(Equal("m0")) + + assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate this machine because the operation already failed in the latest 1h0m0s (RetryDelay)") + + err = env.Get(ctx, client.ObjectKey{Namespace: m1.Namespace, Name: m1.Name}, m1) + g.Expect(err).ToNot(HaveOccurred()) + g.Expect(m1.ObjectMeta.DeletionTimestamp.IsZero()).To(BeTrue()) + + removeFinalizer(g, m1) + g.Expect(env.Cleanup(ctx, m1, m2, m3)).To(Succeed()) + }) + + // There are no preflight checks for when control plane is not yet initialized + // (it is the first CP, we can nuke it). + + // Preflight checks for when control plane is already initialized. + t.Run("Remediation does not happen if desired replicas <= 1", func(t *testing.T) { g := NewWithT(t) m := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed()) controlPlane := &internal.ControlPlane{ - KCP: &controlplanev1.KubeadmControlPlane{Spec: controlplanev1.KubeadmControlPlaneSpec{ - Replicas: utilpointer.Int32(1), - RolloutStrategy: &controlplanev1.RolloutStrategy{ - RollingUpdate: &controlplanev1.RollingUpdate{ - MaxSurge: &intstr.IntOrString{ - IntVal: 1, + KCP: &controlplanev1.KubeadmControlPlane{ + Spec: controlplanev1.KubeadmControlPlaneSpec{ + Replicas: utilpointer.Int32(1), + RolloutStrategy: &controlplanev1.RolloutStrategy{ + RollingUpdate: &controlplanev1.RollingUpdate{ + MaxSurge: &intstr.IntOrString{ + IntVal: 1, + }, }, }, }, - }}, - Cluster: &clusterv1.Cluster{}, - Machines: collections.FromMachines(m), + Status: controlplanev1.KubeadmControlPlaneStatus{ + Initialized: true, + }, + }, + Cluster: &clusterv1.Cluster{}, + Machines: collections.FromMachines(m), + } + ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane) + + g.Expect(ret.IsZero()).To(BeTrue()) // Remediation skipped + g.Expect(err).ToNot(HaveOccurred()) + + g.Expect(controlPlane.KCP.Annotations).ToNot(HaveKey(controlplanev1.RemediatingInProgressAnnotation)) + g.Expect(controlPlane.KCP.Status.LastRemediation).To(BeNil()) + + assertMachineCondition(ctx, g, m, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate if current replicas are less or equal to 1") + + g.Expect(env.Cleanup(ctx, m)).To(Succeed()) + }) + t.Run("Remediation does not happen if there is a deleting machine", func(t *testing.T) { + g := NewWithT(t) + + m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed()) + m2 := createMachine(ctx, g, ns.Name, "m2-healthy-") + m3 := getDeletingMachine(ns.Name, "m3-deleting") // NB. 
This machine is not created, it gets only added to control plane + controlPlane := &internal.ControlPlane{ + KCP: &controlplanev1.KubeadmControlPlane{ + Spec: controlplanev1.KubeadmControlPlaneSpec{ + Replicas: utilpointer.Int32(3), + }, + Status: controlplanev1.KubeadmControlPlaneStatus{ + Initialized: true, + }, + }, + Cluster: &clusterv1.Cluster{}, + Machines: collections.FromMachines(m1, m2, m3), + } + ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane) + + g.Expect(ret.IsZero()).To(BeTrue()) // Remediation skipped + g.Expect(err).ToNot(HaveOccurred()) + + g.Expect(controlPlane.KCP.Annotations).ToNot(HaveKey(controlplanev1.RemediatingInProgressAnnotation)) + g.Expect(controlPlane.KCP.Status.LastRemediation).To(BeNil()) + + assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP waiting for control plane machine deletion to complete before triggering remediation") + + g.Expect(env.Cleanup(ctx, m1, m2)).To(Succeed()) + }) + t.Run("Remediation does not happen if there is at least one additional unhealthy etcd member on a 3 machine CP", func(t *testing.T) { + g := NewWithT(t) + + m1 := createMachine(ctx, g, ns.Name, "m1-mhc-unhealthy-", withMachineHealthCheckFailed()) + m2 := createMachine(ctx, g, ns.Name, "m2-etcd-unhealthy-", withUnhealthyEtcdMember()) + m3 := createMachine(ctx, g, ns.Name, "m3-etcd-healthy-", withHealthyEtcdMember()) + + controlPlane := &internal.ControlPlane{ + KCP: &controlplanev1.KubeadmControlPlane{ + Spec: controlplanev1.KubeadmControlPlaneSpec{ + Replicas: utilpointer.Int32(3), + }, + Status: controlplanev1.KubeadmControlPlaneStatus{ + Initialized: true, + }, + }, + Cluster: &clusterv1.Cluster{}, + Machines: collections.FromMachines(m1, m2, m3), + } + + r := &KubeadmControlPlaneReconciler{ + Client: env.GetClient(), + recorder: record.NewFakeRecorder(32), + managementCluster: &fakeManagementCluster{ + Workload: fakeWorkloadCluster{ + EtcdMembersResult: nodes(controlPlane.Machines), + }, + }, + } + + ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane) + + g.Expect(ret.IsZero()).To(BeTrue()) // Remediation skipped + g.Expect(err).ToNot(HaveOccurred()) + + g.Expect(controlPlane.KCP.Annotations).ToNot(HaveKey(controlplanev1.RemediatingInProgressAnnotation)) + g.Expect(controlPlane.KCP.Status.LastRemediation).To(BeNil()) + + assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate this machine because this could result in etcd loosing quorum") + + g.Expect(env.Cleanup(ctx, m1, m2, m3)).To(Succeed()) + }) + t.Run("Remediation does not happen if there are at least two additional unhealthy etcd member on a 5 machine CP", func(t *testing.T) { + g := NewWithT(t) + + m1 := createMachine(ctx, g, ns.Name, "m1-mhc-unhealthy-", withMachineHealthCheckFailed()) + m2 := createMachine(ctx, g, ns.Name, "m2-etcd-unhealthy-", withUnhealthyEtcdMember()) + m3 := createMachine(ctx, g, ns.Name, "m3-etcd-unhealthy-", withUnhealthyEtcdMember()) + m4 := createMachine(ctx, g, ns.Name, "m4-etcd-healthy-", withHealthyEtcdMember()) + m5 := createMachine(ctx, g, ns.Name, "m5-etcd-healthy-", withHealthyEtcdMember()) + + controlPlane := &internal.ControlPlane{ + KCP: &controlplanev1.KubeadmControlPlane{ + Spec: controlplanev1.KubeadmControlPlaneSpec{ + Replicas: utilpointer.Int32(5), + }, + Status: 
controlplanev1.KubeadmControlPlaneStatus{ + Initialized: true, + }, + }, + Cluster: &clusterv1.Cluster{}, + Machines: collections.FromMachines(m1, m2, m3, m4, m5), + } + + r := &KubeadmControlPlaneReconciler{ + Client: env.GetClient(), + recorder: record.NewFakeRecorder(32), + managementCluster: &fakeManagementCluster{ + Workload: fakeWorkloadCluster{ + EtcdMembersResult: nodes(controlPlane.Machines), + }, + }, + } + + ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane) + + g.Expect(ret.IsZero()).To(BeTrue()) // Remediation skipped + g.Expect(err).ToNot(HaveOccurred()) + + g.Expect(controlPlane.KCP.Annotations).ToNot(HaveKey(controlplanev1.RemediatingInProgressAnnotation)) + g.Expect(controlPlane.KCP.Status.LastRemediation).To(BeNil()) + + assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate this machine because this could result in etcd loosing quorum") + + g.Expect(env.Cleanup(ctx, m1, m2, m3, m4, m5)).To(Succeed()) + }) + + // Remediation for when control plane is not yet initialized + + t.Run("Remediation deletes unhealthy machine - 1 CP not initialized", func(t *testing.T) { + g := NewWithT(t) + + m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer()) + + controlPlane := &internal.ControlPlane{ + KCP: &controlplanev1.KubeadmControlPlane{ + Spec: controlplanev1.KubeadmControlPlaneSpec{ + Replicas: utilpointer.Int32(1), + Version: "v1.19.1", + }, + Status: controlplanev1.KubeadmControlPlaneStatus{ + Initialized: false, + }, + }, + Cluster: &clusterv1.Cluster{}, + Machines: collections.FromMachines(m1), + } + + r := &KubeadmControlPlaneReconciler{ + Client: env.GetClient(), + recorder: record.NewFakeRecorder(32), + managementCluster: &fakeManagementCluster{ + Workload: fakeWorkloadCluster{ + EtcdMembersResult: nodes(controlPlane.Machines), + }, + }, + } + + ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane) + + g.Expect(ret.IsZero()).To(BeFalse()) // Remediation completed, requeue + g.Expect(err).ToNot(HaveOccurred()) + + g.Expect(controlPlane.KCP.Annotations).To(HaveKey(controlplanev1.RemediatingInProgressAnnotation)) + g.Expect(controlPlane.KCP.Status.LastRemediation.RetryCount).To(Equal(int32(0))) + g.Expect(controlPlane.KCP.Status.LastRemediation.Machine).To(Equal(m1.Name)) + + assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "") + + err = env.Get(ctx, client.ObjectKey{Namespace: m1.Namespace, Name: m1.Name}, m1) + g.Expect(err).ToNot(HaveOccurred()) + g.Expect(m1.ObjectMeta.DeletionTimestamp.IsZero()).To(BeFalse()) + + removeFinalizer(g, m1) + g.Expect(env.Cleanup(ctx, m1)).To(Succeed()) + }) + t.Run("Subsequent remediation of the same machine increase retry count - 1 CP not initialized", func(t *testing.T) { + g := NewWithT(t) + + m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer()) + + controlPlane := &internal.ControlPlane{ + KCP: &controlplanev1.KubeadmControlPlane{ + Spec: controlplanev1.KubeadmControlPlaneSpec{ + Replicas: utilpointer.Int32(1), + Version: "v1.19.1", + }, + Status: controlplanev1.KubeadmControlPlaneStatus{ + Initialized: false, + }, + }, + Cluster: &clusterv1.Cluster{}, + Machines: collections.FromMachines(m1), + } + + // First reconcile, remediate 
machine m1 for the first time + r := &KubeadmControlPlaneReconciler{ + Client: env.GetClient(), + recorder: record.NewFakeRecorder(32), + managementCluster: &fakeManagementCluster{ + Workload: fakeWorkloadCluster{ + EtcdMembersResult: nodes(controlPlane.Machines), + }, + }, + } + + ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane) + + g.Expect(ret.IsZero()).To(BeFalse()) // Remediation completed, requeue + g.Expect(err).ToNot(HaveOccurred()) + + g.Expect(controlPlane.KCP.Annotations).To(HaveKey(controlplanev1.RemediatingInProgressAnnotation)) + g.Expect(controlPlane.KCP.Status.LastRemediation.RetryCount).To(Equal(int32(0))) + g.Expect(controlPlane.KCP.Status.LastRemediation.Machine).To(Equal(m1.Name)) + + assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "") + + err = env.Get(ctx, client.ObjectKey{Namespace: m1.Namespace, Name: m1.Name}, m1) + g.Expect(err).ToNot(HaveOccurred()) + g.Expect(m1.ObjectMeta.DeletionTimestamp.IsZero()).To(BeFalse()) + + removeFinalizer(g, m1) + g.Expect(env.CleanupAndWait(ctx, m1)).To(Succeed()) + + machineRemediatingFor := m1.Name + for i := 2; i < 4; i++ { + // Simulate the creation of a replacement for 0. + mi := createMachine(ctx, g, ns.Name, fmt.Sprintf("m%d-unhealthy-", i), withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer(), withRemediateForAnnotation(machineRemediatingFor)) + + // Simulate KCP dropping RemediationInProgressAnnotation after creating the replacement machine. + delete(controlPlane.KCP.Annotations, controlplanev1.RemediatingInProgressAnnotation) + + controlPlane.Machines = collections.FromMachines(mi) + + // Reconcile unhealthy replacements for m1. + r.managementCluster = &fakeManagementCluster{ + Workload: fakeWorkloadCluster{ + EtcdMembersResult: nodes(collections.FromMachines(mi)), + }, + } + ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane) + + g.Expect(ret.IsZero()).To(BeFalse()) // Remediation completed, requeue + g.Expect(err).ToNot(HaveOccurred()) + + g.Expect(controlPlane.KCP.Annotations).To(HaveKey(controlplanev1.RemediatingInProgressAnnotation)) + g.Expect(controlPlane.KCP.Status.LastRemediation.RetryCount).To(Equal(int32(i - 1))) + g.Expect(controlPlane.KCP.Status.LastRemediation.Machine).To(Equal(mi.Name)) + + assertMachineCondition(ctx, g, mi, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "") + + err = env.Get(ctx, client.ObjectKey{Namespace: mi.Namespace, Name: mi.Name}, mi) + g.Expect(err).ToNot(HaveOccurred()) + g.Expect(mi.ObjectMeta.DeletionTimestamp.IsZero()).To(BeFalse()) + + removeFinalizer(g, mi) + g.Expect(env.CleanupAndWait(ctx, mi)).To(Succeed()) + + machineRemediatingFor = mi.Name + } + }) + + // Remediation for when control plane is already initialized + + t.Run("Remediation deletes unhealthy machine - 2 CP (during 1 CP rolling upgrade)", func(t *testing.T) { + g := NewWithT(t) + + m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer()) + m2 := createMachine(ctx, g, ns.Name, "m2-healthy-", withHealthyEtcdMember()) + + controlPlane := &internal.ControlPlane{ + KCP: &controlplanev1.KubeadmControlPlane{ + Spec: controlplanev1.KubeadmControlPlaneSpec{ + Replicas: utilpointer.Int32(2), + Version: "v1.19.1", + }, + Status: controlplanev1.KubeadmControlPlaneStatus{ + Initialized: true, + }, + }, + Cluster: 
&clusterv1.Cluster{}, + Machines: collections.FromMachines(m1, m2), + } + + r := &KubeadmControlPlaneReconciler{ + Client: env.GetClient(), + recorder: record.NewFakeRecorder(32), + managementCluster: &fakeManagementCluster{ + Workload: fakeWorkloadCluster{ + EtcdMembersResult: nodes(controlPlane.Machines), + }, + }, } - ret, err := r.reconcileUnhealthyMachines(context.TODO(), controlPlane) - g.Expect(ret.IsZero()).To(BeTrue()) // Remediation skipped + ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane) + + g.Expect(ret.IsZero()).To(BeFalse()) // Remediation completed, requeue g.Expect(err).ToNot(HaveOccurred()) - assertMachineCondition(ctx, g, m, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate if current replicas are less or equal to 1") - g.Expect(env.Cleanup(ctx, m)).To(Succeed()) - }) - t.Run("Remediation does not happen if number of machines lower than desired", func(t *testing.T) { - g := NewWithT(t) + g.Expect(controlPlane.KCP.Annotations).To(HaveKey(controlplanev1.RemediatingInProgressAnnotation)) + g.Expect(controlPlane.KCP.Status.LastRemediation.RetryCount).To(Equal(int32(0))) + g.Expect(controlPlane.KCP.Status.LastRemediation.Machine).To(Equal(m1.Name)) - m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed()) - m2 := createMachine(ctx, g, ns.Name, "m2-healthy-") - controlPlane := &internal.ControlPlane{ - KCP: &controlplanev1.KubeadmControlPlane{Spec: controlplanev1.KubeadmControlPlaneSpec{ - Replicas: utilpointer.Int32(3), - RolloutStrategy: &controlplanev1.RolloutStrategy{}, - }}, - Cluster: &clusterv1.Cluster{}, - Machines: collections.FromMachines(m1, m2), - } - ret, err := r.reconcileUnhealthyMachines(context.TODO(), controlPlane) + assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "") - g.Expect(ret.IsZero()).To(BeTrue()) // Remediation skipped + err = env.Get(ctx, client.ObjectKey{Namespace: m1.Namespace, Name: m1.Name}, m1) g.Expect(err).ToNot(HaveOccurred()) - assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP waiting for having at least 3 control plane machines before triggering remediation") + g.Expect(m1.ObjectMeta.DeletionTimestamp.IsZero()).To(BeFalse()) + removeFinalizer(g, m1) g.Expect(env.Cleanup(ctx, m1, m2)).To(Succeed()) }) - t.Run("Remediation does not happen if there is a deleting machine", func(t *testing.T) { + t.Run("Remediation deletes unhealthy machine - 3 CP", func(t *testing.T) { g := NewWithT(t) - m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed()) - m2 := createMachine(ctx, g, ns.Name, "m2-healthy-") - m3 := getDeletingMachine(ns.Name, "m3-deleting") // NB. 
This machine is not created, it gets only added to control plane + m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer()) + m2 := createMachine(ctx, g, ns.Name, "m2-healthy-", withHealthyEtcdMember()) + m3 := createMachine(ctx, g, ns.Name, "m3-healthy-", withHealthyEtcdMember()) + controlPlane := &internal.ControlPlane{ - KCP: &controlplanev1.KubeadmControlPlane{Spec: controlplanev1.KubeadmControlPlaneSpec{ - Replicas: utilpointer.Int32(3), - }}, + KCP: &controlplanev1.KubeadmControlPlane{ + Spec: controlplanev1.KubeadmControlPlaneSpec{ + Replicas: utilpointer.Int32(3), + Version: "v1.19.1", + }, + Status: controlplanev1.KubeadmControlPlaneStatus{ + Initialized: true, + }, + }, Cluster: &clusterv1.Cluster{}, Machines: collections.FromMachines(m1, m2, m3), } - ret, err := r.reconcileUnhealthyMachines(context.TODO(), controlPlane) - g.Expect(ret.IsZero()).To(BeTrue()) // Remediation skipped + r := &KubeadmControlPlaneReconciler{ + Client: env.GetClient(), + recorder: record.NewFakeRecorder(32), + managementCluster: &fakeManagementCluster{ + Workload: fakeWorkloadCluster{ + EtcdMembersResult: nodes(controlPlane.Machines), + }, + }, + } + + ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane) + + g.Expect(ret.IsZero()).To(BeFalse()) // Remediation completed, requeue g.Expect(err).ToNot(HaveOccurred()) - assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP waiting for control plane machine deletion to complete before triggering remediation") - g.Expect(env.Cleanup(ctx, m1, m2)).To(Succeed()) + g.Expect(controlPlane.KCP.Annotations).To(HaveKey(controlplanev1.RemediatingInProgressAnnotation)) + g.Expect(controlPlane.KCP.Status.LastRemediation.RetryCount).To(Equal(int32(0))) + g.Expect(controlPlane.KCP.Status.LastRemediation.Machine).To(Equal(m1.Name)) + + assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "") + + err = env.Get(ctx, client.ObjectKey{Namespace: m1.Namespace, Name: m1.Name}, m1) + g.Expect(err).ToNot(HaveOccurred()) + g.Expect(m1.ObjectMeta.DeletionTimestamp.IsZero()).To(BeFalse()) + + removeFinalizer(g, m1) + g.Expect(env.Cleanup(ctx, m1, m2, m3)).To(Succeed()) }) - t.Run("Remediation does not happen if there is at least one additional unhealthy etcd member on a 3 machine CP", func(t *testing.T) { + t.Run("Remediation deletes unhealthy machine - 4 CP (during 3 CP rolling upgrade)", func(t *testing.T) { g := NewWithT(t) - m1 := createMachine(ctx, g, ns.Name, "m1-mhc-unhealthy-", withMachineHealthCheckFailed()) - m2 := createMachine(ctx, g, ns.Name, "m2-etcd-unhealthy-", withUnhealthyEtcdMember()) - m3 := createMachine(ctx, g, ns.Name, "m3-etcd-healthy-", withHealthyEtcdMember()) + m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer()) + m2 := createMachine(ctx, g, ns.Name, "m2-healthy-", withHealthyEtcdMember()) + m3 := createMachine(ctx, g, ns.Name, "m3-healthy-", withHealthyEtcdMember()) + m4 := createMachine(ctx, g, ns.Name, "m4-healthy-", withHealthyEtcdMember()) controlPlane := &internal.ControlPlane{ - KCP: &controlplanev1.KubeadmControlPlane{Spec: controlplanev1.KubeadmControlPlaneSpec{ - Replicas: utilpointer.Int32(3), - }}, + KCP: &controlplanev1.KubeadmControlPlane{ + Spec: 
controlplanev1.KubeadmControlPlaneSpec{ + Replicas: utilpointer.Int32(4), + Version: "v1.19.1", + }, + Status: controlplanev1.KubeadmControlPlaneStatus{ + Initialized: true, + }, + }, Cluster: &clusterv1.Cluster{}, - Machines: collections.FromMachines(m1, m2, m3), + Machines: collections.FromMachines(m1, m2, m3, m4), } r := &KubeadmControlPlaneReconciler{ @@ -201,29 +797,44 @@ func TestReconcileUnhealthyMachines(t *testing.T) { }, } - ret, err := r.reconcileUnhealthyMachines(context.TODO(), controlPlane) + ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane) - g.Expect(ret.IsZero()).To(BeTrue()) // Remediation skipped + g.Expect(ret.IsZero()).To(BeFalse()) // Remediation completed, requeue g.Expect(err).ToNot(HaveOccurred()) - assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate this machine because this could result in etcd loosing quorum") - g.Expect(env.Cleanup(ctx, m1, m2, m3)).To(Succeed()) + g.Expect(controlPlane.KCP.Annotations).To(HaveKey(controlplanev1.RemediatingInProgressAnnotation)) + g.Expect(controlPlane.KCP.Status.LastRemediation.RetryCount).To(Equal(int32(0))) + g.Expect(controlPlane.KCP.Status.LastRemediation.Machine).To(Equal(m1.Name)) + + assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "") + + err = env.Get(ctx, client.ObjectKey{Namespace: m1.Namespace, Name: m1.Name}, m1) + g.Expect(err).ToNot(HaveOccurred()) + g.Expect(m1.ObjectMeta.DeletionTimestamp.IsZero()).To(BeFalse()) + + removeFinalizer(g, m1) + g.Expect(env.Cleanup(ctx, m1, m2, m3, m4)).To(Succeed()) }) - t.Run("Remediation does not happen if there is at least two additional unhealthy etcd member on a 5 machine CP", func(t *testing.T) { + t.Run("Remediation fails gracefully if no healthy Control Planes are available to become etcd leader", func(t *testing.T) { g := NewWithT(t) - m1 := createMachine(ctx, g, ns.Name, "m1-mhc-unhealthy-", withMachineHealthCheckFailed()) - m2 := createMachine(ctx, g, ns.Name, "m2-etcd-unhealthy-", withUnhealthyEtcdMember()) - m3 := createMachine(ctx, g, ns.Name, "m3-etcd-unhealthy-", withUnhealthyEtcdMember()) - m4 := createMachine(ctx, g, ns.Name, "m4-etcd-healthy-", withHealthyEtcdMember()) - m5 := createMachine(ctx, g, ns.Name, "m5-etcd-healthy-", withHealthyEtcdMember()) + m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer()) + m2 := createMachine(ctx, g, ns.Name, "m2-healthy-", withMachineHealthCheckFailed(), withHealthyEtcdMember()) + m3 := createMachine(ctx, g, ns.Name, "m3-healthy-", withMachineHealthCheckFailed(), withHealthyEtcdMember()) + m4 := createMachine(ctx, g, ns.Name, "m4-healthy-", withMachineHealthCheckFailed(), withHealthyEtcdMember()) controlPlane := &internal.ControlPlane{ - KCP: &controlplanev1.KubeadmControlPlane{Spec: controlplanev1.KubeadmControlPlaneSpec{ - Replicas: utilpointer.Int32(5), - }}, + KCP: &controlplanev1.KubeadmControlPlane{ + Spec: controlplanev1.KubeadmControlPlaneSpec{ + Replicas: utilpointer.Int32(4), + Version: "v1.19.1", + }, + Status: controlplanev1.KubeadmControlPlaneStatus{ + Initialized: true, + }, + }, Cluster: &clusterv1.Cluster{}, - Machines: collections.FromMachines(m1, m2, m3, m4, m5), + Machines: collections.FromMachines(m1, m2, m3, m4), } r := &KubeadmControlPlaneReconciler{ @@ -235,35 
+846,40 @@ func TestReconcileUnhealthyMachines(t *testing.T) { }, }, } + _, err = r.reconcileUnhealthyMachines(ctx, controlPlane) + g.Expect(err).ToNot(HaveOccurred()) - ret, err := r.reconcileUnhealthyMachines(context.TODO(), controlPlane) + g.Expect(controlPlane.KCP.Annotations).ToNot(HaveKey(controlplanev1.RemediatingInProgressAnnotation)) + g.Expect(controlPlane.KCP.Status.LastRemediation).To(BeNil()) - g.Expect(ret.IsZero()).To(BeTrue()) // Remediation skipped - g.Expect(err).ToNot(HaveOccurred()) - assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate this machine because this could result in etcd loosing quorum") + assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.RemediationFailedReason, clusterv1.ConditionSeverityWarning, + "A control plane machine needs remediation, but there is no healthy machine to forward etcd leadership to. Skipping remediation") - g.Expect(env.Cleanup(ctx, m1, m2, m3, m4, m5)).To(Succeed()) + removeFinalizer(g, m1) + g.Expect(env.Cleanup(ctx, m1, m2, m3, m4)).To(Succeed()) }) - t.Run("Remediation deletes unhealthy machine - 2 CP (during 1 CP rolling upgrade)", func(t *testing.T) { + t.Run("Subsequent remediation of the same machine increase retry count - 3 CP", func(t *testing.T) { g := NewWithT(t) - m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed()) - patchHelper, err := patch.NewHelper(m1, env.GetClient()) - g.Expect(err).ToNot(HaveOccurred()) - m1.ObjectMeta.Finalizers = []string{"wait-before-delete"} - g.Expect(patchHelper.Patch(ctx, m1)) - + m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer()) m2 := createMachine(ctx, g, ns.Name, "m2-healthy-", withHealthyEtcdMember()) + m3 := createMachine(ctx, g, ns.Name, "m3-healthy-", withHealthyEtcdMember()) controlPlane := &internal.ControlPlane{ - KCP: &controlplanev1.KubeadmControlPlane{Spec: controlplanev1.KubeadmControlPlaneSpec{ - Replicas: utilpointer.Int32(2), - Version: "v1.19.1", - }}, + KCP: &controlplanev1.KubeadmControlPlane{ + Spec: controlplanev1.KubeadmControlPlaneSpec{ + Replicas: utilpointer.Int32(1), + Version: "v1.19.1", + }, + Status: controlplanev1.KubeadmControlPlaneStatus{ + Initialized: false, + }, + }, Cluster: &clusterv1.Cluster{}, - Machines: collections.FromMachines(m1, m2), + Machines: collections.FromMachines(m1, m2, m3), } + // First reconcile, remediate machine m1 for the first time r := &KubeadmControlPlaneReconciler{ Client: env.GetClient(), recorder: record.NewFakeRecorder(32), @@ -274,43 +890,98 @@ func TestReconcileUnhealthyMachines(t *testing.T) { }, } - ret, err := r.reconcileUnhealthyMachines(context.TODO(), controlPlane) + ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane) g.Expect(ret.IsZero()).To(BeFalse()) // Remediation completed, requeue g.Expect(err).ToNot(HaveOccurred()) + g.Expect(controlPlane.KCP.Annotations).To(HaveKey(controlplanev1.RemediatingInProgressAnnotation)) + g.Expect(controlPlane.KCP.Status.LastRemediation.RetryCount).To(Equal(int32(0))) + g.Expect(controlPlane.KCP.Status.LastRemediation.Machine).To(Equal(m1.Name)) + assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "") err = env.Get(ctx, client.ObjectKey{Namespace: 
m1.Namespace, Name: m1.Name}, m1) g.Expect(err).ToNot(HaveOccurred()) g.Expect(m1.ObjectMeta.DeletionTimestamp.IsZero()).To(BeFalse()) - patchHelper, err = patch.NewHelper(m1, env.GetClient()) - g.Expect(err).ToNot(HaveOccurred()) - m1.ObjectMeta.Finalizers = nil - g.Expect(patchHelper.Patch(ctx, m1)) + removeFinalizer(g, m1) + g.Expect(env.CleanupAndWait(ctx, m1)).To(Succeed()) - g.Expect(env.Cleanup(ctx, m1, m2)).To(Succeed()) + machineRemediatingFor := m1.Name + for i := 5; i < 6; i++ { + // Simulate the creation of a replacement for m1. + mi := createMachine(ctx, g, ns.Name, fmt.Sprintf("m%d-unhealthy-", i), withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer(), withRemediateForAnnotation(machineRemediatingFor)) + + // Simulate KCP dropping RemediationInProgressAnnotation after creating the replacement machine. + delete(controlPlane.KCP.Annotations, controlplanev1.RemediatingInProgressAnnotation) + controlPlane.Machines = collections.FromMachines(mi, m2, m3) + + // Reconcile unhealthy replacements for m1. + r.managementCluster = &fakeManagementCluster{ + Workload: fakeWorkloadCluster{ + EtcdMembersResult: nodes(collections.FromMachines(mi, m2, m3)), + }, + } + + ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane) + + g.Expect(ret.IsZero()).To(BeFalse()) // Remediation completed, requeue + g.Expect(err).ToNot(HaveOccurred()) + + g.Expect(controlPlane.KCP.Annotations).To(HaveKey(controlplanev1.RemediatingInProgressAnnotation)) + g.Expect(controlPlane.KCP.Status.LastRemediation.RetryCount).To(Equal(int32(i - 4))) + g.Expect(controlPlane.KCP.Status.LastRemediation.Machine).To(Equal(mi.Name)) + + assertMachineCondition(ctx, g, mi, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "") + + err = env.Get(ctx, client.ObjectKey{Namespace: mi.Namespace, Name: mi.Name}, mi) + g.Expect(err).ToNot(HaveOccurred()) + g.Expect(mi.ObjectMeta.DeletionTimestamp.IsZero()).To(BeFalse()) + + removeFinalizer(g, mi) + g.Expect(env.CleanupAndWait(ctx, mi)).To(Succeed()) + + machineRemediatingFor = mi.Name + } + + g.Expect(env.CleanupAndWait(ctx, m2, m3)).To(Succeed()) }) - t.Run("Remediation deletes unhealthy machine - 3 CP", func(t *testing.T) { +} + +func TestReconcileUnhealthyMachinesSequences(t *testing.T) { + var removeFinalizer = func(g *WithT, m *clusterv1.Machine) { + patchHelper, err := patch.NewHelper(m, env.GetClient()) + g.Expect(err).ToNot(HaveOccurred()) + m.ObjectMeta.Finalizers = nil + g.Expect(patchHelper.Patch(ctx, m)) + } + + t.Run("Remediates the first CP machine having problems to come up", func(t *testing.T) { g := NewWithT(t) - m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed()) - patchHelper, err := patch.NewHelper(m1, env.GetClient()) + ns, err := env.CreateNamespace(ctx, "ns1") g.Expect(err).ToNot(HaveOccurred()) - m1.ObjectMeta.Finalizers = []string{"wait-before-delete"} - g.Expect(patchHelper.Patch(ctx, m1)) + defer func() { + g.Expect(env.Cleanup(ctx, ns)).To(Succeed()) + }() - m2 := createMachine(ctx, g, ns.Name, "m2-healthy-", withHealthyEtcdMember()) - m3 := createMachine(ctx, g, ns.Name, "m3-healthy-", withHealthyEtcdMember()) + // Control plane not initialized yet, First CP is unhealthy and gets remediated: + + m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer()) controlPlane := &internal.ControlPlane{ - KCP: &controlplanev1.KubeadmControlPlane{Spec: 
controlplanev1.KubeadmControlPlaneSpec{ - Replicas: utilpointer.Int32(3), - Version: "v1.19.1", - }}, + KCP: &controlplanev1.KubeadmControlPlane{ + Spec: controlplanev1.KubeadmControlPlaneSpec{ + Replicas: utilpointer.Int32(3), + Version: "v1.19.1", + }, + Status: controlplanev1.KubeadmControlPlaneStatus{ + Initialized: false, + }, + }, Cluster: &clusterv1.Cluster{}, - Machines: collections.FromMachines(m1, m2, m3), + Machines: collections.FromMachines(m1), } r := &KubeadmControlPlaneReconciler{ @@ -323,44 +994,99 @@ func TestReconcileUnhealthyMachines(t *testing.T) { }, } - ret, err := r.reconcileUnhealthyMachines(context.TODO(), controlPlane) + ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane) g.Expect(ret.IsZero()).To(BeFalse()) // Remediation completed, requeue g.Expect(err).ToNot(HaveOccurred()) + g.Expect(controlPlane.KCP.Annotations).To(HaveKey(controlplanev1.RemediatingInProgressAnnotation)) + g.Expect(controlPlane.KCP.Status.LastRemediation.RetryCount).To(Equal(int32(0))) + g.Expect(controlPlane.KCP.Status.LastRemediation.Machine).To(Equal(m1.Name)) + assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "") err = env.Get(ctx, client.ObjectKey{Namespace: m1.Namespace, Name: m1.Name}, m1) g.Expect(err).ToNot(HaveOccurred()) g.Expect(m1.ObjectMeta.DeletionTimestamp.IsZero()).To(BeFalse()) - patchHelper, err = patch.NewHelper(m1, env.GetClient()) + removeFinalizer(g, m1) + g.Expect(env.Cleanup(ctx, m1)).To(Succeed()) + + // Fake scaling up, which creates a remediation machine, fast forwards to when also the replacement machine is marked unhealthy. + // NOTE: scale up also resets remediation in progress and remediation counts. + + m2 := createMachine(ctx, g, ns.Name, "m2-unhealthy-", withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer(), withRemediateForAnnotation(m1.Name)) + delete(controlPlane.KCP.Annotations, controlplanev1.RemediatingInProgressAnnotation) + + // Control plane not initialized yet, Second CP is unhealthy and gets remediated (retry 2) + + controlPlane.Machines = collections.FromMachines(m2) + r.managementCluster = &fakeManagementCluster{ + Workload: fakeWorkloadCluster{ + EtcdMembersResult: nodes(controlPlane.Machines), + }, + } + + ret, err = r.reconcileUnhealthyMachines(ctx, controlPlane) + + g.Expect(ret.IsZero()).To(BeFalse()) // Remediation completed, requeue g.Expect(err).ToNot(HaveOccurred()) - m1.ObjectMeta.Finalizers = nil - g.Expect(patchHelper.Patch(ctx, m1)) - g.Expect(env.Cleanup(ctx, m1, m2, m3)).To(Succeed()) + g.Expect(controlPlane.KCP.Annotations).To(HaveKey(controlplanev1.RemediatingInProgressAnnotation)) + g.Expect(controlPlane.KCP.Status.LastRemediation.RetryCount).To(Equal(int32(1))) + g.Expect(controlPlane.KCP.Status.LastRemediation.Machine).To(Equal(m2.Name)) + + assertMachineCondition(ctx, g, m2, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "") + + err = env.Get(ctx, client.ObjectKey{Namespace: m2.Namespace, Name: m2.Name}, m1) + g.Expect(err).ToNot(HaveOccurred()) + g.Expect(m2.ObjectMeta.DeletionTimestamp.IsZero()).To(BeFalse()) + + removeFinalizer(g, m2) + g.Expect(env.Cleanup(ctx, m2)).To(Succeed()) + + // Fake scaling up, which creates a remediation machine, which is healthy. + // NOTE: scale up also resets remediation in progress and remediation counts. 
+ + m3 := createMachine(ctx, g, ns.Name, "m3-healthy-", withHealthyEtcdMember(), withRemediateForAnnotation(m1.Name)) + delete(controlPlane.KCP.Annotations, controlplanev1.RemediatingInProgressAnnotation) + + g.Expect(env.Cleanup(ctx, m3)).To(Succeed()) }) - t.Run("Remediation deletes unhealthy machine - 4 CP (during 3 CP rolling upgrade)", func(t *testing.T) { + + t.Run("Remediates the second CP machine having problems to come up", func(t *testing.T) { g := NewWithT(t) - m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed()) - patchHelper, err := patch.NewHelper(m1, env.GetClient()) + ns, err := env.CreateNamespace(ctx, "ns1") g.Expect(err).ToNot(HaveOccurred()) - m1.ObjectMeta.Finalizers = []string{"wait-before-delete"} - g.Expect(patchHelper.Patch(ctx, m1)) + defer func() { + g.Expect(env.Cleanup(ctx, ns)).To(Succeed()) + }() - m2 := createMachine(ctx, g, ns.Name, "m2-healthy-", withHealthyEtcdMember()) - m3 := createMachine(ctx, g, ns.Name, "m3-healthy-", withHealthyEtcdMember()) - m4 := createMachine(ctx, g, ns.Name, "m4-healthy-", withHealthyEtcdMember()) + // Control plane initialized yet, First CP healthy, second CP is unhealthy and gets remediated: + + m1 := createMachine(ctx, g, ns.Name, "m1-healthy-", withHealthyEtcdMember()) + m2 := createMachine(ctx, g, ns.Name, "m2-unhealthy-", withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer()) controlPlane := &internal.ControlPlane{ - KCP: &controlplanev1.KubeadmControlPlane{Spec: controlplanev1.KubeadmControlPlaneSpec{ - Replicas: utilpointer.Int32(4), - Version: "v1.19.1", - }}, + KCP: &controlplanev1.KubeadmControlPlane{ + Spec: controlplanev1.KubeadmControlPlaneSpec{ + Replicas: utilpointer.Int32(3), + Version: "v1.19.1", + RolloutStrategy: &controlplanev1.RolloutStrategy{ + RollingUpdate: &controlplanev1.RollingUpdate{ + MaxSurge: &intstr.IntOrString{ + IntVal: 1, + }, + }, + }, + }, + Status: controlplanev1.KubeadmControlPlaneStatus{ + Initialized: true, + }, + }, Cluster: &clusterv1.Cluster{}, - Machines: collections.FromMachines(m1, m2, m3, m4), + Machines: collections.FromMachines(m1, m2), } r := &KubeadmControlPlaneReconciler{ @@ -373,44 +1099,100 @@ func TestReconcileUnhealthyMachines(t *testing.T) { }, } - ret, err := r.reconcileUnhealthyMachines(context.TODO(), controlPlane) + ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane) g.Expect(ret.IsZero()).To(BeFalse()) // Remediation completed, requeue g.Expect(err).ToNot(HaveOccurred()) - assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "") + g.Expect(controlPlane.KCP.Annotations).To(HaveKey(controlplanev1.RemediatingInProgressAnnotation)) + g.Expect(controlPlane.KCP.Status.LastRemediation.RetryCount).To(Equal(int32(0))) + g.Expect(controlPlane.KCP.Status.LastRemediation.Machine).To(Equal(m2.Name)) - err = env.Get(ctx, client.ObjectKey{Namespace: m1.Namespace, Name: m1.Name}, m1) + assertMachineCondition(ctx, g, m2, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "") + + err = env.Get(ctx, client.ObjectKey{Namespace: m2.Namespace, Name: m2.Name}, m2) g.Expect(err).ToNot(HaveOccurred()) - g.Expect(m1.ObjectMeta.DeletionTimestamp.IsZero()).To(BeFalse()) + g.Expect(m2.ObjectMeta.DeletionTimestamp.IsZero()).To(BeFalse()) + + removeFinalizer(g, m2) + g.Expect(env.Cleanup(ctx, m2)).To(Succeed()) + + // Fake scaling up, 
which creates a remediation machine, fast forwards to when also the replacement machine is marked unhealthy. + // NOTE: scale up also resets remediation in progress and remediation counts. - patchHelper, err = patch.NewHelper(m1, env.GetClient()) + m3 := createMachine(ctx, g, ns.Name, "m3-unhealthy-", withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer(), withRemediateForAnnotation(m2.Name)) + delete(controlPlane.KCP.Annotations, controlplanev1.RemediatingInProgressAnnotation) + + // Control plane not initialized yet, Second CP is unhealthy and gets remediated (retry 2) + + controlPlane.Machines = collections.FromMachines(m1, m3) + r.managementCluster = &fakeManagementCluster{ + Workload: fakeWorkloadCluster{ + EtcdMembersResult: nodes(controlPlane.Machines), + }, + } + + ret, err = r.reconcileUnhealthyMachines(ctx, controlPlane) + + g.Expect(ret.IsZero()).To(BeFalse()) // Remediation completed, requeue g.Expect(err).ToNot(HaveOccurred()) - m1.ObjectMeta.Finalizers = nil - g.Expect(patchHelper.Patch(ctx, m1)) - g.Expect(env.Cleanup(ctx, m1, m2, m3, m4)).To(Succeed()) + g.Expect(controlPlane.KCP.Annotations).To(HaveKey(controlplanev1.RemediatingInProgressAnnotation)) + g.Expect(controlPlane.KCP.Status.LastRemediation.RetryCount).To(Equal(int32(1))) + g.Expect(controlPlane.KCP.Status.LastRemediation.Machine).To(Equal(m3.Name)) + + assertMachineCondition(ctx, g, m3, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "") + + err = env.Get(ctx, client.ObjectKey{Namespace: m3.Namespace, Name: m3.Name}, m3) + g.Expect(err).ToNot(HaveOccurred()) + g.Expect(m2.ObjectMeta.DeletionTimestamp.IsZero()).To(BeFalse()) + + removeFinalizer(g, m3) + g.Expect(env.Cleanup(ctx, m3)).To(Succeed()) + + // Fake scaling up, which creates a remediation machine, which is healthy. + // NOTE: scale up also resets remediation in progress and remediation counts. + + m4 := createMachine(ctx, g, ns.Name, "m4-healthy-", withHealthyEtcdMember(), withRemediateForAnnotation(m1.Name)) + delete(controlPlane.KCP.Annotations, controlplanev1.RemediatingInProgressAnnotation) + + g.Expect(env.Cleanup(ctx, m1, m4)).To(Succeed()) }) - t.Run("Remediation does not happen if no healthy Control Planes are available to become etcd leader", func(t *testing.T) { + + t.Run("Remediates only one CP machine in case of multiple failures", func(t *testing.T) { g := NewWithT(t) - m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed()) - patchHelper, err := patch.NewHelper(m1, env.GetClient()) + ns, err := env.CreateNamespace(ctx, "ns1") g.Expect(err).ToNot(HaveOccurred()) - m1.ObjectMeta.Finalizers = []string{"wait-before-delete"} - g.Expect(patchHelper.Patch(ctx, m1)) + defer func() { + g.Expect(env.Cleanup(ctx, ns)).To(Succeed()) + }() - m2 := createMachine(ctx, g, ns.Name, "m2-healthy-", withMachineHealthCheckFailed(), withHealthyEtcdMember()) - m3 := createMachine(ctx, g, ns.Name, "m3-healthy-", withMachineHealthCheckFailed(), withHealthyEtcdMember()) - m4 := createMachine(ctx, g, ns.Name, "m4-healthy-", withMachineHealthCheckFailed(), withHealthyEtcdMember()) + // Control plane initialized yet, First CP healthy, second and third CP are unhealthy. 
second gets remediated: + + m1 := createMachine(ctx, g, ns.Name, "m1-healthy-", withHealthyEtcdMember()) + m2 := createMachine(ctx, g, ns.Name, "m2-unhealthy-", withHealthyEtcdMember(), withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer()) + m3 := createMachine(ctx, g, ns.Name, "m3-unhealthy-", withHealthyEtcdMember(), withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer()) controlPlane := &internal.ControlPlane{ - KCP: &controlplanev1.KubeadmControlPlane{Spec: controlplanev1.KubeadmControlPlaneSpec{ - Replicas: utilpointer.Int32(4), - Version: "v1.19.1", - }}, + KCP: &controlplanev1.KubeadmControlPlane{ + Spec: controlplanev1.KubeadmControlPlaneSpec{ + Replicas: utilpointer.Int32(3), + Version: "v1.19.1", + RolloutStrategy: &controlplanev1.RolloutStrategy{ + RollingUpdate: &controlplanev1.RollingUpdate{ + MaxSurge: &intstr.IntOrString{ + IntVal: 1, + }, + }, + }, + }, + Status: controlplanev1.KubeadmControlPlaneStatus{ + Initialized: true, + }, + }, Cluster: &clusterv1.Cluster{}, - Machines: collections.FromMachines(m1, m2, m3, m4), + Machines: collections.FromMachines(m1, m2, m3), } r := &KubeadmControlPlaneReconciler{ @@ -422,24 +1204,46 @@ func TestReconcileUnhealthyMachines(t *testing.T) { }, }, } - _, err = r.reconcileUnhealthyMachines(context.TODO(), controlPlane) + + ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane) + + g.Expect(ret.IsZero()).To(BeFalse()) // Remediation completed, requeue g.Expect(err).ToNot(HaveOccurred()) - assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.RemediationFailedReason, clusterv1.ConditionSeverityWarning, - "A control plane machine needs remediation, but there is no healthy machine to forward etcd leadership to. Skipping remediation") + g.Expect(controlPlane.KCP.Annotations).To(HaveKey(controlplanev1.RemediatingInProgressAnnotation)) + g.Expect(controlPlane.KCP.Status.LastRemediation.RetryCount).To(Equal(int32(0))) + g.Expect(controlPlane.KCP.Status.LastRemediation.Machine).To(Equal(m2.Name)) + + assertMachineCondition(ctx, g, m2, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "") + assertMachineCondition(ctx, g, m3, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "") - patchHelper, err = patch.NewHelper(m1, env.GetClient()) + err = env.Get(ctx, client.ObjectKey{Namespace: m2.Namespace, Name: m2.Name}, m2) g.Expect(err).ToNot(HaveOccurred()) - m1.ObjectMeta.Finalizers = nil - g.Expect(patchHelper.Patch(ctx, m1)) + g.Expect(m2.ObjectMeta.DeletionTimestamp.IsZero()).To(BeFalse()) - g.Expect(env.Cleanup(ctx, m1, m2, m3, m4)).To(Succeed()) + removeFinalizer(g, m2) + g.Expect(env.Cleanup(ctx, m2)).To(Succeed()) + + // Check next reconcile does not further remediate + + controlPlane.Machines = collections.FromMachines(m1, m3) + r.managementCluster = &fakeManagementCluster{ + Workload: fakeWorkloadCluster{ + EtcdMembersResult: nodes(controlPlane.Machines), + }, + } + + ret, err = r.reconcileUnhealthyMachines(ctx, controlPlane) + + g.Expect(ret.IsZero()).To(BeTrue()) // Remediation skipped + g.Expect(err).ToNot(HaveOccurred()) + + g.Expect(env.Cleanup(ctx, m1)).To(Succeed()) }) } func TestCanSafelyRemoveEtcdMember(t *testing.T) { g := NewWithT(t) - ctx := context.TODO() ns, err := env.CreateNamespace(ctx, "ns1") g.Expect(err).ToNot(HaveOccurred()) @@ -470,7 +1274,7 @@ func 
TestCanSafelyRemoveEtcdMember(t *testing.T) { }, } - ret, err := r.canSafelyRemoveEtcdMember(context.TODO(), controlPlane, m1) + ret, err := r.canSafelyRemoveEtcdMember(ctx, controlPlane, m1) g.Expect(ret).To(BeFalse()) g.Expect(err).ToNot(HaveOccurred()) @@ -501,7 +1305,7 @@ func TestCanSafelyRemoveEtcdMember(t *testing.T) { }, } - ret, err := r.canSafelyRemoveEtcdMember(context.TODO(), controlPlane, m1) + ret, err := r.canSafelyRemoveEtcdMember(ctx, controlPlane, m1) g.Expect(ret).To(BeTrue()) g.Expect(err).ToNot(HaveOccurred()) @@ -538,7 +1342,7 @@ func TestCanSafelyRemoveEtcdMember(t *testing.T) { }, } - ret, err := r.canSafelyRemoveEtcdMember(context.TODO(), controlPlane, m1) + ret, err := r.canSafelyRemoveEtcdMember(ctx, controlPlane, m1) g.Expect(ret).To(BeTrue()) g.Expect(err).ToNot(HaveOccurred()) @@ -568,7 +1372,7 @@ func TestCanSafelyRemoveEtcdMember(t *testing.T) { }, } - ret, err := r.canSafelyRemoveEtcdMember(context.TODO(), controlPlane, m1) + ret, err := r.canSafelyRemoveEtcdMember(ctx, controlPlane, m1) g.Expect(ret).To(BeFalse()) g.Expect(err).ToNot(HaveOccurred()) @@ -599,7 +1403,7 @@ func TestCanSafelyRemoveEtcdMember(t *testing.T) { }, } - ret, err := r.canSafelyRemoveEtcdMember(context.TODO(), controlPlane, m1) + ret, err := r.canSafelyRemoveEtcdMember(ctx, controlPlane, m1) g.Expect(ret).To(BeTrue()) g.Expect(err).ToNot(HaveOccurred()) @@ -637,7 +1441,7 @@ func TestCanSafelyRemoveEtcdMember(t *testing.T) { }, } - ret, err := r.canSafelyRemoveEtcdMember(context.TODO(), controlPlane, m1) + ret, err := r.canSafelyRemoveEtcdMember(ctx, controlPlane, m1) g.Expect(ret).To(BeTrue()) g.Expect(err).ToNot(HaveOccurred()) @@ -668,7 +1472,7 @@ func TestCanSafelyRemoveEtcdMember(t *testing.T) { }, } - ret, err := r.canSafelyRemoveEtcdMember(context.TODO(), controlPlane, m1) + ret, err := r.canSafelyRemoveEtcdMember(ctx, controlPlane, m1) g.Expect(ret).To(BeFalse()) g.Expect(err).ToNot(HaveOccurred()) @@ -701,7 +1505,7 @@ func TestCanSafelyRemoveEtcdMember(t *testing.T) { }, } - ret, err := r.canSafelyRemoveEtcdMember(context.TODO(), controlPlane, m1) + ret, err := r.canSafelyRemoveEtcdMember(ctx, controlPlane, m1) g.Expect(ret).To(BeTrue()) g.Expect(err).ToNot(HaveOccurred()) @@ -734,7 +1538,7 @@ func TestCanSafelyRemoveEtcdMember(t *testing.T) { }, } - ret, err := r.canSafelyRemoveEtcdMember(context.TODO(), controlPlane, m1) + ret, err := r.canSafelyRemoveEtcdMember(ctx, controlPlane, m1) g.Expect(ret).To(BeFalse()) g.Expect(err).ToNot(HaveOccurred()) @@ -769,7 +1573,7 @@ func TestCanSafelyRemoveEtcdMember(t *testing.T) { }, } - ret, err := r.canSafelyRemoveEtcdMember(context.TODO(), controlPlane, m1) + ret, err := r.canSafelyRemoveEtcdMember(ctx, controlPlane, m1) g.Expect(ret).To(BeTrue()) g.Expect(err).ToNot(HaveOccurred()) @@ -804,7 +1608,7 @@ func TestCanSafelyRemoveEtcdMember(t *testing.T) { }, } - ret, err := r.canSafelyRemoveEtcdMember(context.TODO(), controlPlane, m1) + ret, err := r.canSafelyRemoveEtcdMember(ctx, controlPlane, m1) g.Expect(ret).To(BeFalse()) g.Expect(err).ToNot(HaveOccurred()) @@ -859,6 +1663,21 @@ func withNodeRef(ref string) machineOption { } } +func withRemediateForAnnotation(remediatedFor string) machineOption { + return func(machine *clusterv1.Machine) { + if machine.Annotations == nil { + machine.Annotations = map[string]string{} + } + machine.Annotations[controlplanev1.MachineRemediationForAnnotation] = remediatedFor + } +} + +func withWaitBeforeDeleteFinalizer() machineOption { + return func(machine *clusterv1.Machine) { + machine.Finalizers = 
[]string{"wait-before-delete"} + } +} + func createMachine(ctx context.Context, g *WithT, namespace, name string, options ...machineOption) *clusterv1.Machine { m := &clusterv1.Machine{ ObjectMeta: metav1.ObjectMeta{ diff --git a/internal/controllers/machinehealthcheck/machinehealthcheck_controller.go b/internal/controllers/machinehealthcheck/machinehealthcheck_controller.go index 14117c3c18e5..23e11e52d0b9 100644 --- a/internal/controllers/machinehealthcheck/machinehealthcheck_controller.go +++ b/internal/controllers/machinehealthcheck/machinehealthcheck_controller.go @@ -199,15 +199,19 @@ func (r *Reconciler) reconcile(ctx context.Context, logger logr.Logger, cluster UID: cluster.UID, }) - // Get the remote cluster cache to use as a client.Reader. - remoteClient, err := r.Tracker.GetClient(ctx, util.ObjectKey(cluster)) - if err != nil { - logger.Error(err, "error creating remote cluster cache") - return ctrl.Result{}, err - } + // If the cluster is already initialized, get the remote cluster cache to use as a client.Reader. + var remoteClient client.Client + if conditions.IsTrue(cluster, clusterv1.ControlPlaneInitializedCondition) { + var err error + remoteClient, err = r.Tracker.GetClient(ctx, util.ObjectKey(cluster)) + if err != nil { + logger.Error(err, "error creating remote cluster cache") + return ctrl.Result{}, err + } - if err := r.watchClusterNodes(ctx, cluster); err != nil { - return ctrl.Result{}, err + if err := r.watchClusterNodes(ctx, cluster); err != nil { + return ctrl.Result{}, err + } } // fetch all targets diff --git a/internal/controllers/machinehealthcheck/machinehealthcheck_targets.go b/internal/controllers/machinehealthcheck/machinehealthcheck_targets.go index 97eb0659bf0c..bdd9c5861f2a 100644 --- a/internal/controllers/machinehealthcheck/machinehealthcheck_targets.go +++ b/internal/controllers/machinehealthcheck/machinehealthcheck_targets.go @@ -31,6 +31,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" + "sigs.k8s.io/cluster-api/util" "sigs.k8s.io/cluster-api/util/annotations" "sigs.k8s.io/cluster-api/util/conditions" "sigs.k8s.io/cluster-api/util/patch" @@ -111,8 +112,9 @@ func (t *healthCheckTarget) needsRemediation(logger logr.Logger, timeoutForMachi return true, time.Duration(0) } - // Don't penalize any Machine/Node if the control plane has not been initialized. - if !conditions.IsTrue(t.Cluster, clusterv1.ControlPlaneInitializedCondition) { + // Don't penalize any Machine/Node if the control plane has not been initialized + // Exception of this rule are control plane machine itself, so the first control plane machine can be remediated. + if !conditions.IsTrue(t.Cluster, clusterv1.ControlPlaneInitializedCondition) && !util.IsControlPlaneMachine(t.Machine) { logger.V(3).Info("Not evaluating target health because the control plane has not yet been initialized") // Return a nextCheck time of 0 because we'll get requeued when the Cluster is updated. 
return false, 0 @@ -218,16 +220,18 @@ func (r *Reconciler) getTargetsFromMHC(ctx context.Context, logger logr.Logger, Machine: &machines[k], patchHelper: patchHelper, } - node, err := r.getNodeFromMachine(ctx, clusterClient, target.Machine) - if err != nil { - if !apierrors.IsNotFound(err) { - return nil, errors.Wrap(err, "error getting node") + if clusterClient != nil { + node, err := r.getNodeFromMachine(ctx, clusterClient, target.Machine) + if err != nil { + if !apierrors.IsNotFound(err) { + return nil, errors.Wrap(err, "error getting node") + } + + // A node has been seen for this machine, but it no longer exists + target.nodeMissing = true } - - // A node has been seen for this machine, but it no longer exists - target.nodeMissing = true + target.Node = node } - target.Node = node targets = append(targets, target) } return targets, nil diff --git a/test/e2e/config/docker.yaml b/test/e2e/config/docker.yaml index 708ecef3b2e0..74a52cd3647e 100644 --- a/test/e2e/config/docker.yaml +++ b/test/e2e/config/docker.yaml @@ -290,3 +290,6 @@ intervals: node-drain/wait-deployment-available: ["3m", "10s"] node-drain/wait-control-plane: ["15m", "10s"] node-drain/wait-machine-deleted: ["2m", "10s"] + kcp-remediation/wait-machines: ["5m", "10s"] + kcp-remediation/check-machines-stable: ["30s", "5s"] + kcp-remediation/wait-machine-provisioned: ["5m", "10s"] diff --git a/test/e2e/data/infrastructure-docker/main/cluster-template-kcp-remediation/cluster-with-kcp.yaml b/test/e2e/data/infrastructure-docker/main/cluster-template-kcp-remediation/cluster-with-kcp.yaml new file mode 100644 index 000000000000..12516bab5ccd --- /dev/null +++ b/test/e2e/data/infrastructure-docker/main/cluster-template-kcp-remediation/cluster-with-kcp.yaml @@ -0,0 +1,98 @@ +--- +# DockerCluster object referenced by the Cluster object +apiVersion: infrastructure.cluster.x-k8s.io/v1beta1 +kind: DockerCluster +metadata: + name: '${CLUSTER_NAME}' +--- +# Cluster object with +# - Reference to the KubeadmControlPlane object +# - the label cni=${CLUSTER_NAME}-crs-0, so the cluster can be selected by the ClusterResourceSet. +apiVersion: cluster.x-k8s.io/v1beta1 +kind: Cluster +metadata: + name: '${CLUSTER_NAME}' + labels: + cni: "${CLUSTER_NAME}-crs-0" +spec: + clusterNetwork: + services: + cidrBlocks: ['${DOCKER_SERVICE_CIDRS}'] + pods: + cidrBlocks: ['${DOCKER_POD_CIDRS}'] + serviceDomain: '${DOCKER_SERVICE_DOMAIN}' + infrastructureRef: + apiVersion: infrastructure.cluster.x-k8s.io/v1beta1 + kind: DockerCluster + name: '${CLUSTER_NAME}' + controlPlaneRef: + kind: KubeadmControlPlane + apiVersion: controlplane.cluster.x-k8s.io/v1beta1 + name: "${CLUSTER_NAME}-control-plane" +--- +# DockerMachineTemplate object referenced by the KubeadmControlPlane object +apiVersion: infrastructure.cluster.x-k8s.io/v1beta1 +kind: DockerMachineTemplate +metadata: + name: "${CLUSTER_NAME}-control-plane" +spec: + template: + spec: {} +--- +# KubeadmControlPlane referenced by the Cluster +kind: KubeadmControlPlane +apiVersion: controlplane.cluster.x-k8s.io/v1beta1 +metadata: + name: "${CLUSTER_NAME}-control-plane" +spec: + replicas: ${CONTROL_PLANE_MACHINE_COUNT} + machineTemplate: + infrastructureRef: + kind: DockerMachineTemplate + apiVersion: infrastructure.cluster.x-k8s.io/v1beta1 + name: "${CLUSTER_NAME}-control-plane" + kubeadmConfigSpec: + clusterConfiguration: + controllerManager: + extraArgs: {enable-hostpath-provisioner: 'true'} + apiServer: + # host.docker.internal is required by kubetest when running on MacOS because of the way ports are proxied. 
+ certSANs: [localhost, 127.0.0.1, 0.0.0.0, host.docker.internal] + initConfiguration: + nodeRegistration: + kubeletExtraArgs: + eviction-hard: 'nodefs.available<0%,nodefs.inodesFree<0%,imagefs.available<0%' + joinConfiguration: + nodeRegistration: + kubeletExtraArgs: + eviction-hard: 'nodefs.available<0%,nodefs.inodesFree<0%,imagefs.available<0%' + files: + - path: /wait-signal.sh + content: | + #!/bin/bash + + set -o errexit + set -o pipefail + + echo "Waiting for signal..." + + TOKEN=$1 + SERVER=$2 + NAMESPACE=$3 + + while true; + do + sleep 1s + + signal=$(curl -k -s --header "Authorization: Bearer $TOKEN" $SERVER/api/v1/namespaces/$NAMESPACE/configmaps/mhc-test | jq -r .data.signal?) + echo "signal $signal" + + if [ "$signal" == "pass" ]; then + curl -k -s --header "Authorization: Bearer $TOKEN" -XPATCH -H "Content-Type: application/strategic-merge-patch+json" --data '{"data": {"signal": "ack-pass"}}' $SERVER/api/v1/namespaces/$NAMESPACE/configmaps/mhc-test + exit 0 + fi + done + permissions: "0777" + preKubeadmCommands: + - ./wait-signal.sh "${TOKEN}" "${SERVER}" "${NAMESPACE}" + version: "${KUBERNETES_VERSION}" diff --git a/test/e2e/data/infrastructure-docker/main/cluster-template-kcp-remediation/kustomization.yaml b/test/e2e/data/infrastructure-docker/main/cluster-template-kcp-remediation/kustomization.yaml index e234e37be1b2..92761f83d820 100644 --- a/test/e2e/data/infrastructure-docker/main/cluster-template-kcp-remediation/kustomization.yaml +++ b/test/e2e/data/infrastructure-docker/main/cluster-template-kcp-remediation/kustomization.yaml @@ -1,5 +1,5 @@ bases: - - ../bases/cluster-with-kcp.yaml + - cluster-with-kcp.yaml - ../bases/md.yaml - ../bases/crs.yaml - mhc.yaml diff --git a/test/e2e/data/infrastructure-docker/main/cluster-template-kcp-remediation/mhc.yaml b/test/e2e/data/infrastructure-docker/main/cluster-template-kcp-remediation/mhc.yaml index 3ed3e0a9473a..39187cec0a40 100644 --- a/test/e2e/data/infrastructure-docker/main/cluster-template-kcp-remediation/mhc.yaml +++ b/test/e2e/data/infrastructure-docker/main/cluster-template-kcp-remediation/mhc.yaml @@ -1,7 +1,8 @@ --- # MachineHealthCheck object with -# - a selector that targets all the machines with label cluster.x-k8s.io/control-plane="" -# - unhealthyConditions triggering remediation after 10s the condition is set +# - a selector that targets all the machines with label cluster.x-k8s.io/control-plane="" and the mhc-test: "fail" (the label is used to trigger remediation in a controlled way - by adding CP under MHC control intentionally -) +# - nodeStartupTimeout: 30s (to force remediation on nodes still provisioning) +# - unhealthyConditions triggering remediation after 10s the e2e.remediation.condition condition is set to false (to force remediation on nodes already provisioned) apiVersion: cluster.x-k8s.io/v1beta1 kind: MachineHealthCheck metadata: @@ -12,6 +13,8 @@ spec: selector: matchLabels: cluster.x-k8s.io/control-plane: "" + mhc-test: "fail" + nodeStartupTimeout: 30s unhealthyConditions: - type: e2e.remediation.condition status: "False" diff --git a/test/e2e/kcp_remediations.go b/test/e2e/kcp_remediations.go new file mode 100644 index 000000000000..6d16633dfbd8 --- /dev/null +++ b/test/e2e/kcp_remediations.go @@ -0,0 +1,693 @@ +/* +Copyright 2020 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package e2e + +import ( + "context" + "fmt" + "io" + "os" + "os/exec" + "path/filepath" + "runtime" + "strings" + "time" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + corev1 "k8s.io/api/core/v1" + rbacv1 "k8s.io/api/rbac/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/sets" + "k8s.io/client-go/tools/clientcmd" + "k8s.io/utils/pointer" + "sigs.k8s.io/controller-runtime/pkg/client" + + clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" + "sigs.k8s.io/cluster-api/test/e2e/internal/log" + "sigs.k8s.io/cluster-api/test/framework" + "sigs.k8s.io/cluster-api/test/framework/clusterctl" + "sigs.k8s.io/cluster-api/util" +) + +const ( + configMapName = "mhc-test" + configMapDataKey = "signal" + + failLabelValue = "fail" +) + +// KCPRemediationSpecInput is the input for KCPRemediationSpec. +type KCPRemediationSpecInput struct { + // This spec requires following intervals to be defined in order to work: + // - wait-cluster, used when waiting for the cluster infrastructure to be provisioned. + // - wait-machines, used when waiting for an old machine to be remediated and a new one provisioned. + // - check-machines-stable, used when checking that the current list of machines in stable. + // - wait-machine-provisioned, used when waiting for a machine to be provisioned after unblocking bootstrap. + E2EConfig *clusterctl.E2EConfig + ClusterctlConfigPath string + BootstrapClusterProxy framework.ClusterProxy + ArtifactFolder string + SkipCleanup bool + + // Flavor, if specified, must refer to a template that has a MachineHealthCheck + // - 3 node CP, no workers + // - Control plane machines having a pre-kubeadm command that queries for a well-known ConfigMap on the management cluster, + // holding up bootstrap until a signal is passed via the config map. + // NOTE: In order for this to work communications from workload cluster to management cluster must be enabled. + // - An MHC targeting control plane machines with the mhc-test=fail labels and + // nodeStartupTimeout: 30s + // unhealthyConditions: + // - type: e2e.remediation.condition + // status: "False" + // timeout: 10s + // If not specified, "kcp-remediation" is used. + Flavor *string +} + +// KCPRemediationSpec implements a test that verifies that Machines are remediated by MHC during unhealthy conditions. +func KCPRemediationSpec(ctx context.Context, inputGetter func() KCPRemediationSpecInput) { + var ( + specName = "kcp-remediation" + input KCPRemediationSpecInput + namespace *corev1.Namespace + cancelWatches context.CancelFunc + clusterResources *clusterctl.ApplyClusterTemplateAndWaitResult + ) + + BeforeEach(func() { + Expect(ctx).NotTo(BeNil(), "ctx is required for %s spec", specName) + input = inputGetter() + Expect(input.E2EConfig).ToNot(BeNil(), "Invalid argument. input.E2EConfig can't be nil when calling %s spec", specName) + Expect(input.ClusterctlConfigPath).To(BeAnExistingFile(), "Invalid argument. input.ClusterctlConfigPath must be an existing file when calling %s spec", specName) + Expect(input.BootstrapClusterProxy).ToNot(BeNil(), "Invalid argument. 
input.BootstrapClusterProxy can't be nil when calling %s spec", specName) + Expect(os.MkdirAll(input.ArtifactFolder, 0750)).To(Succeed(), "Invalid argument. input.ArtifactFolder can't be created for %s spec", specName) + Expect(input.E2EConfig.Variables).To(HaveKey(KubernetesVersion)) + + // Set up a Namespace to host objects for this spec and create a watcher for the namespace events. + namespace, cancelWatches = setupSpecNamespace(ctx, specName, input.BootstrapClusterProxy, input.ArtifactFolder) + }) + + It("Should replace unhealthy machines", func() { + By("Creating a workload cluster") + + // NOTE: This test is quite different from other tests, because it has to trigger failures on machines in a controlled way. + + // Creates the mhc-test ConfigMap that will be used to control machine bootstrap during the remediation tests. + createConfigMapForMachinesBootstrapSignal(ctx, input.BootstrapClusterProxy.GetClient(), namespace.Name) + + // Creates the workload cluster. + clusterResources = createWorkloadClusterAndWait(ctx, createWorkloadClusterAndWaitInput{ + E2EConfig: input.E2EConfig, + clusterctlConfigPath: input.ClusterctlConfigPath, + proxy: input.BootstrapClusterProxy, + artifactFolder: input.ArtifactFolder, + specName: specName, + flavor: input.Flavor, + + // values to be injected in the template + + namespace: namespace.Name, + // Token with credentials to use for accessing the ConfigMap on the management cluster from the workload cluster. + // NOTE: this func also sets up credentials/RBAC rules and everything necessary to get the authentication token. + authenticationToken: getAuthenticationToken(ctx, input.BootstrapClusterProxy, namespace.Name), + // Address to be used for accessing the management cluster from a workload cluster. + serverAddr: getServerAddr(input.BootstrapClusterProxy.GetKubeconfigPath()), + }) + + // The first CP machine comes up but it does not complete bootstrap. + + By("FIRST CONTROL PLANE MACHINE") + + By("Wait for the cluster to get stuck with the first CP machine not completing the bootstrap") + allMachines, newMachines := waitForMachines(ctx, waitForMachinesInput{ + lister: input.BootstrapClusterProxy.GetClient(), + namespace: namespace.Name, + clusterName: clusterResources.Cluster.Name, + expectedReplicas: 1, + waitForMachinesIntervals: input.E2EConfig.GetIntervals(specName, "wait-machines"), + checkMachineListStableIntervals: input.E2EConfig.GetIntervals(specName, "check-machines-stable"), + }) + Expect(allMachines).To(HaveLen(1)) + Expect(newMachines).To(HaveLen(1)) + firstMachineName := newMachines[0] + firstMachine := &clusterv1.Machine{ + ObjectMeta: metav1.ObjectMeta{ + Name: firstMachineName, + Namespace: namespace.Name, + }, + } + Expect(input.BootstrapClusterProxy.GetClient().Get(ctx, client.ObjectKeyFromObject(firstMachine), firstMachine)).To(Succeed(), "Failed to get machine %s", firstMachineName) + Expect(firstMachine.Status.NodeRef).To(BeNil()) + log.Logf("Machine %s is up but still bootstrapping", firstMachineName) + + // Intentionally trigger remediation on the first CP and validate that the machine is deleted and a replacement comes up.
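+ // NOTE: as described in mhc.yaml above, the MachineHealthCheck used by this template only selects control plane
+ // machines that also carry the mhc-test: "fail" label and it sets nodeStartupTimeout: 30s, so adding the label to a
+ // machine that has no Node yet is enough to get it marked unhealthy and remediated shortly afterwards.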
+ + By("REMEDIATING FIRST CONTROL PLANE MACHINE") + + Byf("Add mhc-test:fail label to machine %s so it will be immediately remediated", firstMachineName) + firstMachineWithLabel := firstMachine.DeepCopy() + firstMachineWithLabel.Labels["mhc-test"] = failLabelValue + Expect(input.BootstrapClusterProxy.GetClient().Patch(ctx, firstMachineWithLabel, client.MergeFrom(firstMachine))).To(Succeed(), "Failed to patch machine %d", firstMachineName) + + log.Logf("Wait for the first CP machine to be remediated, and the replacement machine to come up, but again get stuck with the Machine not completing the bootstrap") + allMachines, newMachines = waitForMachines(ctx, waitForMachinesInput{ + lister: input.BootstrapClusterProxy.GetClient(), + namespace: namespace.Name, + clusterName: clusterResources.Cluster.Name, + expectedReplicas: 1, + expectedDeletedMachines: []string{firstMachineName}, + waitForMachinesIntervals: input.E2EConfig.GetIntervals(specName, "wait-machines"), + checkMachineListStableIntervals: input.E2EConfig.GetIntervals(specName, "check-machines-stable"), + }) + Expect(allMachines).To(HaveLen(1)) + Expect(newMachines).To(HaveLen(1)) + firstMachineReplacementName := newMachines[0] + firstMachineReplacement := &clusterv1.Machine{ + ObjectMeta: metav1.ObjectMeta{ + Name: firstMachineReplacementName, + Namespace: namespace.Name, + }, + } + Expect(input.BootstrapClusterProxy.GetClient().Get(ctx, client.ObjectKeyFromObject(firstMachineReplacement), firstMachineReplacement)).To(Succeed(), "Failed to get machine %d", firstMachineReplacementName) + Expect(firstMachineReplacement.Status.NodeRef).To(BeNil()) + log.Logf("Machine %s is up but still bootstrapping", firstMachineReplacementName) + + // The firstMachine replacement is up, meaning that the test validated that remediation of the first CP machine works (note: first CP is a special case because the cluster is not initialized yet). + // In order to test remediation of other machine while provisioning we unblock bootstrap of the first CP replacement + // and wait for the second cp machine to come up. 
+ + By("FIRST CONTROL PLANE MACHINE SUCCESSFULLY REMEDIATED!") + + Byf("Unblock bootstrap for Machine %s and wait for it to be provisioned", firstMachineReplacementName) + sendSignalToBootstrappingMachine(ctx, sendSignalToBootstrappingMachineInput{ + client: input.BootstrapClusterProxy.GetClient(), + namespace: namespace.Name, + machine: firstMachineReplacementName, + signal: "pass", + }) + log.Logf("Waiting for Machine %s to be provisioned", firstMachineReplacementName) + Eventually(func() bool { + if err := input.BootstrapClusterProxy.GetClient().Get(ctx, client.ObjectKeyFromObject(firstMachineReplacement), firstMachineReplacement); err != nil { + return false + } + return firstMachineReplacement.Status.NodeRef != nil + }, input.E2EConfig.GetIntervals(specName, "wait-machine-provisioned")...).Should(BeTrue(), "Machine %s failed to be provisioned", firstMachineReplacementName) + + By("FIRST CONTROL PLANE MACHINE UP AND RUNNING!") + By("START PROVISIONING OF SECOND CONTROL PLANE MACHINE!") + + By("Wait for the cluster to get stuck with the second CP machine not completing the bootstrap") + allMachines, newMachines = waitForMachines(ctx, waitForMachinesInput{ + lister: input.BootstrapClusterProxy.GetClient(), + namespace: namespace.Name, + clusterName: clusterResources.Cluster.Name, + expectedReplicas: 2, + expectedDeletedMachines: []string{}, + expectedOldMachines: []string{firstMachineReplacementName}, + waitForMachinesIntervals: input.E2EConfig.GetIntervals(specName, "wait-machines"), + checkMachineListStableIntervals: input.E2EConfig.GetIntervals(specName, "check-machines-stable"), + }) + Expect(allMachines).To(HaveLen(2)) + Expect(newMachines).To(HaveLen(1)) + secondMachineName := newMachines[0] + secondMachine := &clusterv1.Machine{ + ObjectMeta: metav1.ObjectMeta{ + Name: secondMachineName, + Namespace: namespace.Name, + }, + } + Expect(input.BootstrapClusterProxy.GetClient().Get(ctx, client.ObjectKeyFromObject(secondMachine), secondMachine)).To(Succeed(), "Failed to get machine %d", secondMachineName) + Expect(secondMachine.Status.NodeRef).To(BeNil()) + log.Logf("Machine %s is up but still bootstrapping", secondMachineName) + + // Intentionally trigger remediation on the second CP, and validate and validate also this one is deleted and a replacement should come up. 
+ + By("REMEDIATING SECOND CONTROL PLANE MACHINE") + + Byf("Add mhc-test:fail label to machine %s so it will be immediately remediated", firstMachineName) + secondMachineWithLabel := secondMachine.DeepCopy() + secondMachineWithLabel.Labels["mhc-test"] = failLabelValue + Expect(input.BootstrapClusterProxy.GetClient().Patch(ctx, secondMachineWithLabel, client.MergeFrom(secondMachine))).To(Succeed(), "Failed to patch machine %d", secondMachineName) + + log.Logf("Wait for the second CP machine to be remediated, and the replacement machine to come up, but again get stuck with the Machine not completing the bootstrap") + allMachines, newMachines = waitForMachines(ctx, waitForMachinesInput{ + lister: input.BootstrapClusterProxy.GetClient(), + namespace: namespace.Name, + clusterName: clusterResources.Cluster.Name, + expectedReplicas: 2, + expectedDeletedMachines: []string{secondMachineName}, + expectedOldMachines: []string{firstMachineReplacementName}, + waitForMachinesIntervals: input.E2EConfig.GetIntervals(specName, "wait-machines"), + checkMachineListStableIntervals: input.E2EConfig.GetIntervals(specName, "check-machines-stable"), + }) + Expect(allMachines).To(HaveLen(2)) + Expect(newMachines).To(HaveLen(1)) + secondMachineReplacementName := newMachines[0] + secondMachineReplacement := &clusterv1.Machine{ + ObjectMeta: metav1.ObjectMeta{ + Name: secondMachineReplacementName, + Namespace: namespace.Name, + }, + } + Expect(input.BootstrapClusterProxy.GetClient().Get(ctx, client.ObjectKeyFromObject(secondMachineReplacement), secondMachineReplacement)).To(Succeed(), "Failed to get machine %d", secondMachineReplacementName) + Expect(secondMachineReplacement.Status.NodeRef).To(BeNil()) + log.Logf("Machine %s is up but still bootstrapping", secondMachineReplacementName) + + // The secondMachine replacement is up, meaning that the test validated that remediation of the second CP machine works (note: this test remediation after the cluster is initialized, but not yet fully provisioned). + // In order to test remediation after provisioning we unblock bootstrap of the second CP replacement as well as for the third CP machine. + // and wait for the second cp machine to come up. 
+ + By("SECOND CONTROL PLANE MACHINE SUCCESSFULLY REMEDIATED!") + + Byf("Unblock bootstrap for Machine %s and wait for it to be provisioned", secondMachineReplacementName) + sendSignalToBootstrappingMachine(ctx, sendSignalToBootstrappingMachineInput{ + client: input.BootstrapClusterProxy.GetClient(), + namespace: namespace.Name, + machine: secondMachineReplacementName, + signal: "pass", + }) + log.Logf("Waiting for Machine %s to be provisioned", secondMachineReplacementName) + Eventually(func() bool { + if err := input.BootstrapClusterProxy.GetClient().Get(ctx, client.ObjectKeyFromObject(secondMachineReplacement), secondMachineReplacement); err != nil { + return false + } + return secondMachineReplacement.Status.NodeRef != nil + }, input.E2EConfig.GetIntervals(specName, "wait-machine-provisioned")...).Should(BeTrue(), "Machine %s failed to be provisioned", secondMachineReplacementName) + + By("SECOND CONTROL PLANE MACHINE UP AND RUNNING!") + By("START PROVISIONING OF THIRD CONTROL PLANE MACHINE!") + + By("Wait for the cluster to get stuck with the third CP machine not completing the bootstrap") + allMachines, newMachines = waitForMachines(ctx, waitForMachinesInput{ + lister: input.BootstrapClusterProxy.GetClient(), + namespace: namespace.Name, + clusterName: clusterResources.Cluster.Name, + expectedReplicas: 3, + expectedDeletedMachines: []string{}, + expectedOldMachines: []string{firstMachineReplacementName, secondMachineReplacementName}, + waitForMachinesIntervals: input.E2EConfig.GetIntervals(specName, "wait-machines"), + checkMachineListStableIntervals: input.E2EConfig.GetIntervals(specName, "check-machines-stable"), + }) + Expect(allMachines).To(HaveLen(3)) + Expect(newMachines).To(HaveLen(1)) + thirdMachineName := newMachines[0] + thirdMachine := &clusterv1.Machine{ + ObjectMeta: metav1.ObjectMeta{ + Name: thirdMachineName, + Namespace: namespace.Name, + }, + } + Expect(input.BootstrapClusterProxy.GetClient().Get(ctx, client.ObjectKeyFromObject(thirdMachine), thirdMachine)).To(Succeed(), "Failed to get machine %d", thirdMachineName) + Expect(thirdMachine.Status.NodeRef).To(BeNil()) + log.Logf("Machine %s is up but still bootstrapping", thirdMachineName) + + Byf("Unblock bootstrap for Machine %s and wait for it to be provisioned", thirdMachineName) + sendSignalToBootstrappingMachine(ctx, sendSignalToBootstrappingMachineInput{ + client: input.BootstrapClusterProxy.GetClient(), + namespace: namespace.Name, + machine: thirdMachineName, + signal: "pass", + }) + log.Logf("Waiting for Machine %s to be provisioned", thirdMachineName) + Eventually(func() bool { + if err := input.BootstrapClusterProxy.GetClient().Get(ctx, client.ObjectKeyFromObject(thirdMachine), thirdMachine); err != nil { + return false + } + return thirdMachine.Status.NodeRef != nil + }, input.E2EConfig.GetIntervals(specName, "wait-machine-provisioned")...).Should(BeTrue(), "Machine %s failed to be provisioned", thirdMachineName) + + // All three CP machines are up. + + By("ALL THE CONTROL PLANE MACHINES SUCCESSFULLY PROVISIONED!") + + // We now want to test remediation of a CP machine already provisioned. 
+ // In order to do so we need to apply both the mhc-test:fail label and an unhealthy condition on the node in order to trigger remediation. + + By("REMEDIATING THIRD CP") + + Byf("Add mhc-test:fail label to machine %s and set an unhealthy condition on the node so it will be immediately remediated", thirdMachineName) + thirdMachineWithLabel := thirdMachine.DeepCopy() + thirdMachineWithLabel.Labels["mhc-test"] = failLabelValue + Expect(input.BootstrapClusterProxy.GetClient().Patch(ctx, thirdMachineWithLabel, client.MergeFrom(thirdMachine))).To(Succeed(), "Failed to patch machine %s", thirdMachineName) + + unhealthyNodeCondition := corev1.NodeCondition{ + Type: "e2e.remediation.condition", + Status: "False", + LastTransitionTime: metav1.Time{Time: time.Now()}, + } + framework.PatchNodeCondition(ctx, framework.PatchNodeConditionInput{ + ClusterProxy: input.BootstrapClusterProxy, + Cluster: clusterResources.Cluster, + NodeCondition: unhealthyNodeCondition, + Machine: *thirdMachine, // TODO: make this a pointer. + }) + + log.Logf("Wait for the third CP machine to be remediated, and the replacement machine to come up, but again get stuck with the Machine not completing the bootstrap") + allMachines, newMachines = waitForMachines(ctx, waitForMachinesInput{ + lister: input.BootstrapClusterProxy.GetClient(), + namespace: namespace.Name, + clusterName: clusterResources.Cluster.Name, + expectedReplicas: 3, + expectedDeletedMachines: []string{thirdMachineName}, + expectedOldMachines: []string{firstMachineReplacementName, secondMachineReplacementName}, + waitForMachinesIntervals: input.E2EConfig.GetIntervals(specName, "wait-machines"), + checkMachineListStableIntervals: input.E2EConfig.GetIntervals(specName, "check-machines-stable"), + }) + Expect(allMachines).To(HaveLen(3)) + Expect(newMachines).To(HaveLen(1)) + thirdMachineReplacementName := newMachines[0] + thirdMachineReplacement := &clusterv1.Machine{ + ObjectMeta: metav1.ObjectMeta{ + Name: thirdMachineReplacementName, + Namespace: namespace.Name, + }, + } + Expect(input.BootstrapClusterProxy.GetClient().Get(ctx, client.ObjectKeyFromObject(thirdMachineReplacement), thirdMachineReplacement)).To(Succeed(), "Failed to get machine %s", thirdMachineReplacementName) + Expect(thirdMachineReplacement.Status.NodeRef).To(BeNil()) + log.Logf("Machine %s is up but still bootstrapping", thirdMachineReplacementName) + + // The thirdMachine replacement is up, meaning that the test validated that remediation of the third CP machine works (note: this tests remediation after the cluster is fully provisioned). + + By("THIRD CP SUCCESSFULLY REMEDIATED!") + + Byf("Unblock bootstrap for Machine %s and wait for it to be provisioned", thirdMachineReplacementName) + sendSignalToBootstrappingMachine(ctx, sendSignalToBootstrappingMachineInput{ + client: input.BootstrapClusterProxy.GetClient(), + namespace: namespace.Name, + machine: thirdMachineReplacementName, + signal: "pass", + }) + log.Logf("Waiting for Machine %s to be provisioned", thirdMachineReplacementName) + Eventually(func() bool { + if err := input.BootstrapClusterProxy.GetClient().Get(ctx, client.ObjectKeyFromObject(thirdMachineReplacement), thirdMachineReplacement); err != nil { + return false + } + return thirdMachineReplacement.Status.NodeRef != nil + }, input.E2EConfig.GetIntervals(specName, "wait-machine-provisioned")...).Should(BeTrue(), "Machine %s failed to be provisioned", thirdMachineReplacementName) + + // All three CP machines are up again.
+ + By("CP BACK TO FULL OPERATIONAL STATE!") + + By("PASSED!") + }) + + AfterEach(func() { + // Dumps all the resources in the spec namespace, then cleanups the cluster object and the spec namespace itself. + dumpSpecResourcesAndCleanup(ctx, specName, input.BootstrapClusterProxy, input.ArtifactFolder, namespace, cancelWatches, clusterResources.Cluster, input.E2EConfig.GetIntervals, input.SkipCleanup) + }) +} + +func createConfigMapForMachinesBootstrapSignal(ctx context.Context, writer client.Writer, namespace string) { + cm := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: configMapName, + Namespace: namespace, + }, + Data: map[string]string{ + configMapDataKey: "hold", + }, + } + Expect(writer.Create(ctx, cm)).To(Succeed(), "failed to create mhc-test config map") +} + +type createWorkloadClusterAndWaitInput struct { + E2EConfig *clusterctl.E2EConfig + clusterctlConfigPath string + proxy framework.ClusterProxy + artifactFolder string + specName string + flavor *string + namespace string + authenticationToken []byte + serverAddr string +} + +// createWorkloadClusterAndWait creates a workload cluster ard return ass soon as the cluster infrastructure is ready. +// NOTE: clusterResources is filled only partially. +func createWorkloadClusterAndWait(ctx context.Context, input createWorkloadClusterAndWaitInput) (clusterResources *clusterctl.ApplyClusterTemplateAndWaitResult) { + clusterResources = new(clusterctl.ApplyClusterTemplateAndWaitResult) + + // gets the cluster template + log.Logf("Getting the cluster template yaml") + clusterName := fmt.Sprintf("%s-%s", input.specName, util.RandomString(6)) + workloadClusterTemplate := clusterctl.ConfigCluster(ctx, clusterctl.ConfigClusterInput{ + // pass the clusterctl config file that points to the local provider repository created for this test, + ClusterctlConfigPath: input.clusterctlConfigPath, + // pass reference to the management cluster hosting this test + KubeconfigPath: input.proxy.GetKubeconfigPath(), + + // select template + Flavor: pointer.StringDeref(input.flavor, "kcp-remediation"), + // define template variables + Namespace: input.namespace, + ClusterName: clusterName, + KubernetesVersion: input.E2EConfig.GetVariable(KubernetesVersion), + ControlPlaneMachineCount: pointer.Int64(3), + WorkerMachineCount: pointer.Int64(0), + InfrastructureProvider: clusterctl.DefaultInfrastructureProvider, + // setup clusterctl logs folder + LogFolder: filepath.Join(input.artifactFolder, "clusters", input.proxy.GetName()), + // Adds authenticationToken, server address and namespace variables to be injected in the cluster template. + ClusterctlVariables: map[string]string{ + "TOKEN": string(input.authenticationToken), + "SERVER": input.serverAddr, + "NAMESPACE": input.namespace, + }, + }) + Expect(workloadClusterTemplate).ToNot(BeNil(), "Failed to get the cluster template") + + Eventually(func() error { + return input.proxy.Apply(ctx, workloadClusterTemplate) + }, 10*time.Second).Should(Succeed(), "Failed to apply the cluster template") + + log.Logf("Waiting for the cluster infrastructure to be provisioned") + clusterResources.Cluster = framework.DiscoveryAndWaitForCluster(ctx, framework.DiscoveryAndWaitForClusterInput{ + Getter: input.proxy.GetClient(), + Namespace: input.namespace, + Name: clusterName, + }, input.E2EConfig.GetIntervals(input.specName, "wait-cluster")...) 
+ + return clusterResources +} + +type sendSignalToBootstrappingMachineInput struct { + client client.Client + namespace string + machine string + signal string +} + +// sendSignalToBootstrappingMachine sends a signal to a machine stuck during bootstrap. +func sendSignalToBootstrappingMachine(ctx context.Context, input sendSignalToBootstrappingMachineInput) { + log.Logf("Sending bootstrap signal %s to Machine %s", input.signal, input.machine) + cm := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: configMapName, + Namespace: input.namespace, + }, + } + Expect(input.client.Get(ctx, client.ObjectKeyFromObject(cm), cm)).To(Succeed(), "failed to get mhc-test config map") + + cmWithSignal := cm.DeepCopy() + cmWithSignal.Data[configMapDataKey] = input.signal + Expect(input.client.Patch(ctx, cmWithSignal, client.MergeFrom(cm))).To(Succeed(), "failed to patch mhc-test config map") + + log.Logf("Waiting for Machine %s to acknowledge signal %s has been received", input.machine, input.signal) + Eventually(func() string { + _ = input.client.Get(ctx, client.ObjectKeyFromObject(cmWithSignal), cmWithSignal) + return cmWithSignal.Data[configMapDataKey] + }, "1m", "10s").Should(Equal(fmt.Sprintf("ack-%s", input.signal)), "Failed to get ack signal from machine %s", input.machine) + + machine := &clusterv1.Machine{ + ObjectMeta: metav1.ObjectMeta{ + Name: input.machine, + Namespace: input.namespace, + }, + } + Expect(input.client.Get(ctx, client.ObjectKeyFromObject(machine), machine)).To(Succeed()) + + // Resetting the signal in the config map + cmWithSignal.Data[configMapDataKey] = "hold" + Expect(input.client.Patch(ctx, cmWithSignal, client.MergeFrom(cm))).To(Succeed(), "failed to patch mhc-test config map") +} + +type waitForMachinesInput struct { + lister framework.Lister + namespace string + clusterName string + expectedReplicas int + expectedOldMachines []string + expectedDeletedMachines []string + waitForMachinesIntervals []interface{} + checkMachineListStableIntervals []interface{} +} + +// waitForMachines waits for machines to reach a well-known state defined by the number of replicas, a list of machines that must exist, +// and a list of machines that must not exist anymore. The func also checks that the state is stable for some time before +// returning the list of new machines. +func waitForMachines(ctx context.Context, input waitForMachinesInput) (allMachineNames, newMachineNames []string) { + inClustersNamespaceListOption := client.InNamespace(input.namespace) + matchClusterListOption := client.MatchingLabels{ + clusterv1.ClusterNameLabel: input.clusterName, + clusterv1.MachineControlPlaneLabel: "", + } + + expectedOldMachines := sets.NewString(input.expectedOldMachines...) + expectedDeletedMachines := sets.NewString(input.expectedDeletedMachines...) + allMachines := sets.NewString() + newMachines := sets.NewString() + machineList := &clusterv1.MachineList{} + + // Waits for the desired set of machines to exist.
+ log.Logf("Waiting for %d machines, must have %s, must not have %s", input.expectedReplicas, expectedOldMachines.List(), expectedDeletedMachines.List()) + Eventually(func() bool { + // Gets the list of machines + if err := input.lister.List(ctx, machineList, inClustersNamespaceListOption, matchClusterListOption); err != nil { + return false + } + allMachines = sets.NewString() + for i := range machineList.Items { + allMachines.Insert(machineList.Items[i].Name) + } + + // Compute new machines (all - old - to be deleted) + newMachines = allMachines.Clone() + newMachines.Delete(expectedOldMachines.List()...) + newMachines.Delete(expectedDeletedMachines.List()...) + + log.Logf(" - expected %d, got %d: %s, of which new %s, must have check: %t, must not have check: %t", input.expectedReplicas, allMachines.Len(), allMachines.List(), newMachines.List(), allMachines.HasAll(expectedOldMachines.List()...), !allMachines.HasAny(expectedDeletedMachines.List()...)) + + // Ensures all the expected old machines are still there. + if !allMachines.HasAll(expectedOldMachines.List()...) { + return false + } + + // Ensures none of the machines to be deleted is still there. + if allMachines.HasAny(expectedDeletedMachines.List()...) { + return false + } + + return allMachines.Len() == input.expectedReplicas + }, input.waitForMachinesIntervals...).Should(BeTrue(), "Failed to get the expected list of machines: got %s (expected %d machines, must have %s, must not have %s)", allMachines.List(), input.expectedReplicas, expectedOldMachines.List(), expectedDeletedMachines.List()) + log.Logf("Got %d machines: %s", input.expectedReplicas, allMachines.List()) + + // Ensures the desired set of machines is stable (no further machines are created or deleted). + log.Logf("Checking the list of machines is stable") + allMachinesNow := sets.NewString() + Consistently(func() bool { + // Gets the list of machines + if err := input.lister.List(ctx, machineList, inClustersNamespaceListOption, matchClusterListOption); err != nil { + return false + } + allMachinesNow = sets.NewString() + for i := range machineList.Items { + allMachinesNow.Insert(machineList.Items[i].Name) + } + + return allMachines.Len() == allMachinesNow.Len() && allMachines.HasAll(allMachinesNow.List()...) + }, input.checkMachineListStableIntervals...).Should(BeTrue(), "Expected list of machines is not stable: got %s, expected %s", allMachinesNow.List(), allMachines.List()) + + return allMachines.List(), newMachines.List() +} + +// getServerAddr returns the address to be used for accessing the management cluster from a workload cluster. +func getServerAddr(kubeconfigPath string) string { + kubeConfig, err := clientcmd.LoadFromFile(kubeconfigPath) + Expect(err).ToNot(HaveOccurred(), "failed to load management cluster's kubeconfig file") + + clusterName := kubeConfig.Contexts[kubeConfig.CurrentContext].Cluster + Expect(clusterName).ToNot(BeEmpty(), "failed to identify current cluster name in management cluster's kubeconfig file") + + serverAddr := kubeConfig.Clusters[clusterName].Server + Expect(serverAddr).ToNot(BeEmpty(), "failed to identify current server address in management cluster's kubeconfig file") + + // On CAPD, if not running on Linux, we need to use Docker's proxy to connect back to the host + // to the CAPD cluster. Moby on Linux doesn't use the host.docker.internal DNS name. 
+ if runtime.GOOS != "linux" { + serverAddr = strings.ReplaceAll(serverAddr, "127.0.0.1", "host.docker.internal") + } + return serverAddr +} + +// getAuthenticationToken returns a bearer token with minimal RBAC permissions to access the mhc-test ConfigMap that will be used +// to control machine bootstrap during the remediation tests. +func getAuthenticationToken(ctx context.Context, managementClusterProxy framework.ClusterProxy, namespace string) []byte { + sa := &corev1.ServiceAccount{ + ObjectMeta: metav1.ObjectMeta{ + Name: "mhc-test", + Namespace: namespace, + }, + } + Expect(managementClusterProxy.GetClient().Create(ctx, sa)).To(Succeed(), "failed to create mhc-test service account") + + role := &rbacv1.Role{ + ObjectMeta: metav1.ObjectMeta{ + Name: "mhc-test", + Namespace: namespace, + }, + Rules: []rbacv1.PolicyRule{ + { + Verbs: []string{"get", "list", "patch"}, + APIGroups: []string{""}, + Resources: []string{"configmaps"}, + ResourceNames: []string{"mhc-test"}, + }, + }, + } + Expect(managementClusterProxy.GetClient().Create(ctx, role)).To(Succeed(), "failed to create mhc-test role") + + roleBinding := &rbacv1.RoleBinding{ + ObjectMeta: metav1.ObjectMeta{ + Name: "mhc-test", + Namespace: namespace, + }, + Subjects: []rbacv1.Subject{ + { + Kind: "ServiceAccount", + APIGroup: "", + Name: "mhc-test", + Namespace: namespace, + }, + }, + RoleRef: rbacv1.RoleRef{ + APIGroup: rbacv1.SchemeGroupVersion.Group, + Kind: "Role", + Name: "mhc-test", + }, + } + Expect(managementClusterProxy.GetClient().Create(ctx, roleBinding)).To(Succeed(), "failed to create mhc-test role binding") + + cmd := exec.CommandContext(ctx, "kubectl", fmt.Sprintf("--kubeconfig=%s", managementClusterProxy.GetKubeconfigPath()), fmt.Sprintf("--namespace=%s", namespace), "create", "token", "mhc-test") //nolint:gosec + stdout, err := cmd.StdoutPipe() + Expect(err).ToNot(HaveOccurred(), "failed to get stdout for kubectl create token") + stderr, err := cmd.StderrPipe() + Expect(err).ToNot(HaveOccurred(), "failed to get stderr for kubectl create token") + + Expect(cmd.Start()).To(Succeed(), "failed to run kubectl create token") + + output, err := io.ReadAll(stdout) + Expect(err).ToNot(HaveOccurred(), "failed to read stdout from kubectl create token") + errout, err := io.ReadAll(stderr) + Expect(err).ToNot(HaveOccurred(), "failed to read stderr from kubectl create token") + + Expect(cmd.Wait()).To(Succeed(), "failed to wait for kubectl create token") + Expect(errout).To(BeEmpty()) + + return output +} diff --git a/test/e2e/mhc_remediations_test.go b/test/e2e/kcp_remediations_test.go similarity index 83% rename from test/e2e/mhc_remediations_test.go rename to test/e2e/kcp_remediations_test.go index 66471901d939..cdd9fd50efc4 100644 --- a/test/e2e/mhc_remediations_test.go +++ b/test/e2e/kcp_remediations_test.go @@ -23,9 +23,9 @@ import ( .
"github.com/onsi/ginkgo/v2" ) -var _ = Describe("When testing unhealthy machines remediation", func() { - MachineRemediationSpec(ctx, func() MachineRemediationSpecInput { - return MachineRemediationSpecInput{ +var _ = Describe("When testing KCP remediation", func() { + KCPRemediationSpec(ctx, func() KCPRemediationSpecInput { + return KCPRemediationSpecInput{ E2EConfig: e2eConfig, ClusterctlConfigPath: clusterctlConfigPath, BootstrapClusterProxy: bootstrapClusterProxy, diff --git a/test/e2e/mhc_remediations.go b/test/e2e/md_remediations.go similarity index 61% rename from test/e2e/mhc_remediations.go rename to test/e2e/md_remediations.go index b1c3989d4f61..7d95a032518f 100644 --- a/test/e2e/mhc_remediations.go +++ b/test/e2e/md_remediations.go @@ -32,8 +32,8 @@ import ( "sigs.k8s.io/cluster-api/util" ) -// MachineRemediationSpecInput is the input for MachineRemediationSpec. -type MachineRemediationSpecInput struct { +// MachineDeploymentRemediationSpecInput is the input for MachineDeploymentRemediationSpec. +type MachineDeploymentRemediationSpecInput struct { E2EConfig *clusterctl.E2EConfig ClusterctlConfigPath string BootstrapClusterProxy framework.ClusterProxy @@ -41,26 +41,19 @@ type MachineRemediationSpecInput struct { SkipCleanup bool ControlPlaneWaiters clusterctl.ControlPlaneWaiters - // KCPFlavor, if specified, must refer to a template that has a MachineHealthCheck - // resource configured to match the control plane Machines (cluster.x-k8s.io/controlplane: "" label) - // and be configured to treat "e2e.remediation.condition" "False" as an unhealthy - // condition with a short timeout. - // If not specified, "kcp-remediation" is used. - KCPFlavor *string - - // MDFlavor, if specified, must refer to a template that has a MachineHealthCheck + // Flavor, if specified, must refer to a template that has a MachineHealthCheck // resource configured to match the MachineDeployment managed Machines and be // configured to treat "e2e.remediation.condition" "False" as an unhealthy // condition with a short timeout. // If not specified, "md-remediation" is used. - MDFlavor *string + Flavor *string } -// MachineRemediationSpec implements a test that verifies that Machines are remediated by MHC during unhealthy conditions. -func MachineRemediationSpec(ctx context.Context, inputGetter func() MachineRemediationSpecInput) { +// MachineDeploymentRemediationSpec implements a test that verifies that Machines are remediated by MHC during unhealthy conditions. 
+func MachineDeploymentRemediationSpec(ctx context.Context, inputGetter func() MachineDeploymentRemediationSpecInput) { var ( - specName = "mhc-remediation" - input MachineRemediationSpecInput + specName = "md-remediation" + input MachineDeploymentRemediationSpecInput namespace *corev1.Namespace cancelWatches context.CancelFunc clusterResources *clusterctl.ApplyClusterTemplateAndWaitResult @@ -80,7 +73,7 @@ func MachineRemediationSpec(ctx context.Context, inputGetter func() MachineRemed clusterResources = new(clusterctl.ApplyClusterTemplateAndWaitResult) }) - It("Should successfully trigger machine deployment remediation", func() { + It("Should replace unhealthy machines", func() { By("Creating a workload cluster") clusterctl.ApplyClusterTemplateAndWait(ctx, clusterctl.ApplyClusterTemplateAndWaitInput{ @@ -90,7 +83,7 @@ func MachineRemediationSpec(ctx context.Context, inputGetter func() MachineRemed ClusterctlConfigPath: input.ClusterctlConfigPath, KubeconfigPath: input.BootstrapClusterProxy.GetKubeconfigPath(), InfrastructureProvider: clusterctl.DefaultInfrastructureProvider, - Flavor: pointer.StringDeref(input.MDFlavor, "md-remediation"), + Flavor: pointer.StringDeref(input.Flavor, "md-remediation"), Namespace: namespace.Name, ClusterName: fmt.Sprintf("%s-%s", specName, util.RandomString(6)), KubernetesVersion: input.E2EConfig.GetVariable(KubernetesVersion), @@ -103,6 +96,8 @@ func MachineRemediationSpec(ctx context.Context, inputGetter func() MachineRemed WaitForMachineDeployments: input.E2EConfig.GetIntervals(specName, "wait-worker-nodes"), }, clusterResources) + // TODO: this should be rewritten like the KCP remediation test, because the current implementation + // only tests that MHC applies the unhealthy condition but it doesn't test that the unhealthy machine is deleted and a replacement machine comes up.
By("Setting a machine unhealthy and wait for MachineDeployment remediation") framework.DiscoverMachineHealthChecksAndWaitForRemediation(ctx, framework.DiscoverMachineHealthCheckAndWaitForRemediationInput{ ClusterProxy: input.BootstrapClusterProxy, @@ -113,39 +108,6 @@ func MachineRemediationSpec(ctx context.Context, inputGetter func() MachineRemed By("PASSED!") }) - It("Should successfully trigger KCP remediation", func() { - By("Creating a workload cluster") - - clusterctl.ApplyClusterTemplateAndWait(ctx, clusterctl.ApplyClusterTemplateAndWaitInput{ - ClusterProxy: input.BootstrapClusterProxy, - ConfigCluster: clusterctl.ConfigClusterInput{ - LogFolder: filepath.Join(input.ArtifactFolder, "clusters", input.BootstrapClusterProxy.GetName()), - ClusterctlConfigPath: input.ClusterctlConfigPath, - KubeconfigPath: input.BootstrapClusterProxy.GetKubeconfigPath(), - InfrastructureProvider: clusterctl.DefaultInfrastructureProvider, - Flavor: pointer.StringDeref(input.KCPFlavor, "kcp-remediation"), - Namespace: namespace.Name, - ClusterName: fmt.Sprintf("%s-%s", specName, util.RandomString(6)), - KubernetesVersion: input.E2EConfig.GetVariable(KubernetesVersion), - ControlPlaneMachineCount: pointer.Int64(3), - WorkerMachineCount: pointer.Int64(1), - }, - ControlPlaneWaiters: input.ControlPlaneWaiters, - WaitForClusterIntervals: input.E2EConfig.GetIntervals(specName, "wait-cluster"), - WaitForControlPlaneIntervals: input.E2EConfig.GetIntervals(specName, "wait-control-plane"), - WaitForMachineDeployments: input.E2EConfig.GetIntervals(specName, "wait-worker-nodes"), - }, clusterResources) - - By("Setting a machine unhealthy and wait for KubeadmControlPlane remediation") - framework.DiscoverMachineHealthChecksAndWaitForRemediation(ctx, framework.DiscoverMachineHealthCheckAndWaitForRemediationInput{ - ClusterProxy: input.BootstrapClusterProxy, - Cluster: clusterResources.Cluster, - WaitForMachineRemediation: input.E2EConfig.GetIntervals(specName, "wait-machine-remediation"), - }) - - By("PASSED!") - }) - AfterEach(func() { // Dumps all the resources in the spec namespace, then cleanups the cluster object and the spec namespace itself. dumpSpecResourcesAndCleanup(ctx, specName, input.BootstrapClusterProxy, input.ArtifactFolder, namespace, cancelWatches, clusterResources.Cluster, input.E2EConfig.GetIntervals, input.SkipCleanup) diff --git a/test/e2e/md_remediations_test.go b/test/e2e/md_remediations_test.go new file mode 100644 index 000000000000..c35b60938d82 --- /dev/null +++ b/test/e2e/md_remediations_test.go @@ -0,0 +1,36 @@ +//go:build e2e +// +build e2e + +/* +Copyright 2020 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package e2e + +import ( + . 
"github.com/onsi/ginkgo/v2" +) + +var _ = Describe("When testing MachineDeployment remediation", func() { + MachineDeploymentRemediationSpec(ctx, func() MachineDeploymentRemediationSpecInput { + return MachineDeploymentRemediationSpecInput{ + E2EConfig: e2eConfig, + ClusterctlConfigPath: clusterctlConfigPath, + BootstrapClusterProxy: bootstrapClusterProxy, + ArtifactFolder: artifactFolder, + SkipCleanup: skipCleanup, + } + }) +}) diff --git a/test/infrastructure/container/docker.go b/test/infrastructure/container/docker.go index 204ec75ee7a3..a790f881b782 100644 --- a/test/infrastructure/container/docker.go +++ b/test/infrastructure/container/docker.go @@ -161,8 +161,7 @@ func (d *dockerRuntime) GetHostPort(ctx context.Context, containerName, portAndP } // ExecContainer executes a command in a running container and writes any output to the provided writer. -func (d *dockerRuntime) ExecContainer(_ context.Context, containerName string, config *ExecContainerInput, command string, args ...string) error { - ctx := context.Background() // Let the command finish, even if it takes longer than the default timeout +func (d *dockerRuntime) ExecContainer(ctx context.Context, containerName string, config *ExecContainerInput, command string, args ...string) error { execConfig := types.ExecConfig{ // Run with privileges so we can remount etc.. // This might not make sense in the most general sense, but it is diff --git a/test/infrastructure/docker/internal/controllers/dockermachine_controller.go b/test/infrastructure/docker/internal/controllers/dockermachine_controller.go index 2724a81dd46e..570052a91b12 100644 --- a/test/infrastructure/docker/internal/controllers/dockermachine_controller.go +++ b/test/infrastructure/docker/internal/controllers/dockermachine_controller.go @@ -295,7 +295,8 @@ func (r *DockerMachineReconciler) reconcileNormal(ctx context.Context, cluster * // if the machine isn't bootstrapped, only then run bootstrap scripts if !dockerMachine.Spec.Bootstrapped { - timeoutCtx, cancel := context.WithTimeout(ctx, 3*time.Minute) + // TODO: make this timeout configurable via DockerMachine API if it is required by the KCP remediation test. + timeoutCtx, cancel := context.WithTimeout(ctx, 5*time.Minute) defer cancel() // Check for bootstrap success @@ -305,10 +306,29 @@ func (r *DockerMachineReconciler) reconcileNormal(ctx context.Context, cluster * if err := externalMachine.CheckForBootstrapSuccess(timeoutCtx, false); err != nil { bootstrapData, format, err := r.getBootstrapData(timeoutCtx, machine) if err != nil { - log.Error(err, "failed to get bootstrap data") return ctrl.Result{}, err } + // Setup a go routing to check for the machine being deleted while running bootstrap as a + // synchronous process, e.g. due to remediation. The routine stops when timeoutCtx is Done + // (either because canceled intentionally due to machine deletion or canceled by the defer cancel() + // call when exiting from this func). + go func() { + for { + select { + case <-timeoutCtx.Done(): + return + default: + updatedDockerMachine := &infrav1.DockerMachine{} + if err := r.Client.Get(ctx, client.ObjectKeyFromObject(dockerMachine), updatedDockerMachine); err == nil && !updatedDockerMachine.DeletionTimestamp.IsZero() { + cancel() + return + } + time.Sleep(5 * time.Second) + } + } + }() + // Run the bootstrap script. Simulates cloud-init/Ignition. 
if err := externalMachine.ExecBootstrap(timeoutCtx, bootstrapData, format); err != nil { conditions.MarkFalse(dockerMachine, infrav1.BootstrapExecSucceededCondition, infrav1.BootstrapFailedReason, clusterv1.ConditionSeverityWarning, "Repeating bootstrap") @@ -331,6 +351,13 @@ func (r *DockerMachineReconciler) reconcileNormal(ctx context.Context, cluster * return ctrl.Result{RequeueAfter: 5 * time.Second}, nil } + // If the control plane is not yet initialized, there is no API server to contact to get the ProviderID for the Node + // hosted on this machine, so return early. + // NOTE: we are using RequeueAfter with a short interval in order to make test execution time more stable. + if !conditions.IsTrue(cluster, clusterv1.ControlPlaneInitializedCondition) { + return ctrl.Result{RequeueAfter: 15 * time.Second}, nil + } + // Usually a cloud provider will do this, but there is no docker-cloud provider. // Requeue if there is an error, as this is likely momentary load balancer // state changes during control plane provisioning.