From e98e2de6e64414e998928ec6a6d51d7deeb4060c Mon Sep 17 00:00:00 2001 From: fabriziopandini Date: Sat, 21 Jan 2023 13:55:55 +0100 Subject: [PATCH] Add support for KCP remediation during cluster provisioning Co-authored-by: sbueringer --- .../kubeadm/api/v1alpha3/conversion.go | 7 + .../api/v1alpha3/zz_generated.conversion.go | 2 + .../kubeadm/api/v1alpha4/conversion.go | 17 + .../api/v1alpha4/zz_generated.conversion.go | 17 +- .../v1beta1/kubeadm_control_plane_types.go | 86 ++ .../v1beta1/kubeadm_control_plane_webhook.go | 2 + .../kubeadm_control_plane_webhook_test.go | 5 + .../kubeadmcontrolplanetemplate_types.go | 4 + .../api/v1beta1/zz_generated.deepcopy.go | 57 + ...cluster.x-k8s.io_kubeadmcontrolplanes.yaml | 69 + ...x-k8s.io_kubeadmcontrolplanetemplates.yaml | 49 + .../kubeadm/internal/controllers/helpers.go | 12 + .../internal/controllers/helpers_test.go | 5 + .../internal/controllers/remediation.go | 312 ++++- .../internal/controllers/remediation_test.go | 1128 ++++++++++++++--- .../kubeadm/internal/controllers/status.go | 28 + .../src/developer/providers/v1.3-to-v1.4.md | 2 + .../src/reference/labels_and_annotations.md | 2 + .../machinehealthcheck_controller.go | 20 +- .../machinehealthcheck_targets.go | 38 +- test/e2e/config/docker.yaml | 3 + .../cluster-with-kcp.yaml | 98 ++ .../kustomization.yaml | 2 +- .../cluster-template-kcp-remediation/mhc.yaml | 7 +- test/e2e/kcp_remediations.go | 687 ++++++++++ ...tions_test.go => kcp_remediations_test.go} | 6 +- ...mhc_remediations.go => md_remediations.go} | 62 +- test/e2e/md_remediations_test.go | 36 + test/infrastructure/container/docker.go | 3 +- .../controllers/dockermachine_controller.go | 23 +- 30 files changed, 2483 insertions(+), 306 deletions(-) create mode 100644 test/e2e/data/infrastructure-docker/main/cluster-template-kcp-remediation/cluster-with-kcp.yaml create mode 100644 test/e2e/kcp_remediations.go rename test/e2e/{mhc_remediations_test.go => kcp_remediations_test.go} (83%) rename 
test/e2e/{mhc_remediations.go => md_remediations.go} (61%) create mode 100644 test/e2e/md_remediations_test.go diff --git a/controlplane/kubeadm/api/v1alpha3/conversion.go b/controlplane/kubeadm/api/v1alpha3/conversion.go index 0061e6331a8a..e8fb091e02d0 100644 --- a/controlplane/kubeadm/api/v1alpha3/conversion.go +++ b/controlplane/kubeadm/api/v1alpha3/conversion.go @@ -99,6 +99,13 @@ func (src *KubeadmControlPlane) ConvertTo(dstRaw conversion.Hub) error { dst.Spec.KubeadmConfigSpec.InitConfiguration.NodeRegistration.ImagePullPolicy = restored.Spec.KubeadmConfigSpec.InitConfiguration.NodeRegistration.ImagePullPolicy } + if restored.Spec.RemediationStrategy != nil { + dst.Spec.RemediationStrategy = restored.Spec.RemediationStrategy + } + if restored.Status.LastRemediation != nil { + dst.Status.LastRemediation = restored.Status.LastRemediation + } + return nil } diff --git a/controlplane/kubeadm/api/v1alpha3/zz_generated.conversion.go b/controlplane/kubeadm/api/v1alpha3/zz_generated.conversion.go index 94b5206584e7..ce3672490458 100644 --- a/controlplane/kubeadm/api/v1alpha3/zz_generated.conversion.go +++ b/controlplane/kubeadm/api/v1alpha3/zz_generated.conversion.go @@ -201,6 +201,7 @@ func autoConvert_v1beta1_KubeadmControlPlaneSpec_To_v1alpha3_KubeadmControlPlane // WARNING: in.RolloutBefore requires manual conversion: does not exist in peer-type // WARNING: in.RolloutAfter requires manual conversion: does not exist in peer-type out.RolloutStrategy = (*RolloutStrategy)(unsafe.Pointer(in.RolloutStrategy)) + // WARNING: in.RemediationStrategy requires manual conversion: does not exist in peer-type return nil } @@ -257,6 +258,7 @@ func autoConvert_v1beta1_KubeadmControlPlaneStatus_To_v1alpha3_KubeadmControlPla } else { out.Conditions = nil } + // WARNING: in.LastRemediation requires manual conversion: does not exist in peer-type return nil } diff --git a/controlplane/kubeadm/api/v1alpha4/conversion.go b/controlplane/kubeadm/api/v1alpha4/conversion.go index 
9fae7f390022..1637b272a82c 100644 --- a/controlplane/kubeadm/api/v1alpha4/conversion.go +++ b/controlplane/kubeadm/api/v1alpha4/conversion.go @@ -84,6 +84,13 @@ func (src *KubeadmControlPlane) ConvertTo(dstRaw conversion.Hub) error { dst.Spec.KubeadmConfigSpec.InitConfiguration.NodeRegistration.ImagePullPolicy = restored.Spec.KubeadmConfigSpec.InitConfiguration.NodeRegistration.ImagePullPolicy } + if restored.Spec.RemediationStrategy != nil { + dst.Spec.RemediationStrategy = restored.Spec.RemediationStrategy + } + if restored.Status.LastRemediation != nil { + dst.Status.LastRemediation = restored.Status.LastRemediation + } + return nil } @@ -173,6 +180,10 @@ func (src *KubeadmControlPlaneTemplate) ConvertTo(dstRaw conversion.Hub) error { dst.Spec.Template.Spec.KubeadmConfigSpec.InitConfiguration.NodeRegistration.ImagePullPolicy = restored.Spec.Template.Spec.KubeadmConfigSpec.InitConfiguration.NodeRegistration.ImagePullPolicy } + if restored.Spec.Template.Spec.RemediationStrategy != nil { + dst.Spec.Template.Spec.RemediationStrategy = restored.Spec.Template.Spec.RemediationStrategy + } + return nil } @@ -262,5 +273,11 @@ func Convert_v1beta1_KubeadmControlPlaneMachineTemplate_To_v1alpha4_KubeadmContr func Convert_v1beta1_KubeadmControlPlaneSpec_To_v1alpha4_KubeadmControlPlaneSpec(in *controlplanev1.KubeadmControlPlaneSpec, out *KubeadmControlPlaneSpec, scope apiconversion.Scope) error { // .RolloutBefore was added in v1beta1. + // .RemediationStrategy was added in v1beta1. return autoConvert_v1beta1_KubeadmControlPlaneSpec_To_v1alpha4_KubeadmControlPlaneSpec(in, out, scope) } + +func Convert_v1beta1_KubeadmControlPlaneStatus_To_v1alpha4_KubeadmControlPlaneStatus(in *controlplanev1.KubeadmControlPlaneStatus, out *KubeadmControlPlaneStatus, scope apiconversion.Scope) error { + // .LastRemediation was added in v1beta1. 
+ return autoConvert_v1beta1_KubeadmControlPlaneStatus_To_v1alpha4_KubeadmControlPlaneStatus(in, out, scope) +} diff --git a/controlplane/kubeadm/api/v1alpha4/zz_generated.conversion.go b/controlplane/kubeadm/api/v1alpha4/zz_generated.conversion.go index 83cdb2e4718d..efe50a58635e 100644 --- a/controlplane/kubeadm/api/v1alpha4/zz_generated.conversion.go +++ b/controlplane/kubeadm/api/v1alpha4/zz_generated.conversion.go @@ -77,11 +77,6 @@ func RegisterConversions(s *runtime.Scheme) error { }); err != nil { return err } - if err := s.AddGeneratedConversionFunc((*v1beta1.KubeadmControlPlaneStatus)(nil), (*KubeadmControlPlaneStatus)(nil), func(a, b interface{}, scope conversion.Scope) error { - return Convert_v1beta1_KubeadmControlPlaneStatus_To_v1alpha4_KubeadmControlPlaneStatus(a.(*v1beta1.KubeadmControlPlaneStatus), b.(*KubeadmControlPlaneStatus), scope) - }); err != nil { - return err - } if err := s.AddGeneratedConversionFunc((*KubeadmControlPlaneTemplate)(nil), (*v1beta1.KubeadmControlPlaneTemplate)(nil), func(a, b interface{}, scope conversion.Scope) error { return Convert_v1alpha4_KubeadmControlPlaneTemplate_To_v1beta1_KubeadmControlPlaneTemplate(a.(*KubeadmControlPlaneTemplate), b.(*v1beta1.KubeadmControlPlaneTemplate), scope) }); err != nil { @@ -157,6 +152,11 @@ func RegisterConversions(s *runtime.Scheme) error { }); err != nil { return err } + if err := s.AddConversionFunc((*v1beta1.KubeadmControlPlaneStatus)(nil), (*KubeadmControlPlaneStatus)(nil), func(a, b interface{}, scope conversion.Scope) error { + return Convert_v1beta1_KubeadmControlPlaneStatus_To_v1alpha4_KubeadmControlPlaneStatus(a.(*v1beta1.KubeadmControlPlaneStatus), b.(*KubeadmControlPlaneStatus), scope) + }); err != nil { + return err + } if err := s.AddConversionFunc((*v1beta1.KubeadmControlPlaneTemplateResourceSpec)(nil), (*KubeadmControlPlaneSpec)(nil), func(a, b interface{}, scope conversion.Scope) error { return 
Convert_v1beta1_KubeadmControlPlaneTemplateResourceSpec_To_v1alpha4_KubeadmControlPlaneSpec(a.(*v1beta1.KubeadmControlPlaneTemplateResourceSpec), b.(*KubeadmControlPlaneSpec), scope) }); err != nil { @@ -295,6 +295,7 @@ func autoConvert_v1beta1_KubeadmControlPlaneSpec_To_v1alpha4_KubeadmControlPlane // WARNING: in.RolloutBefore requires manual conversion: does not exist in peer-type out.RolloutAfter = (*v1.Time)(unsafe.Pointer(in.RolloutAfter)) out.RolloutStrategy = (*RolloutStrategy)(unsafe.Pointer(in.RolloutStrategy)) + // WARNING: in.RemediationStrategy requires manual conversion: does not exist in peer-type return nil } @@ -352,14 +353,10 @@ func autoConvert_v1beta1_KubeadmControlPlaneStatus_To_v1alpha4_KubeadmControlPla } else { out.Conditions = nil } + // WARNING: in.LastRemediation requires manual conversion: does not exist in peer-type return nil } -// Convert_v1beta1_KubeadmControlPlaneStatus_To_v1alpha4_KubeadmControlPlaneStatus is an autogenerated conversion function. -func Convert_v1beta1_KubeadmControlPlaneStatus_To_v1alpha4_KubeadmControlPlaneStatus(in *v1beta1.KubeadmControlPlaneStatus, out *KubeadmControlPlaneStatus, s conversion.Scope) error { - return autoConvert_v1beta1_KubeadmControlPlaneStatus_To_v1alpha4_KubeadmControlPlaneStatus(in, out, s) -} - func autoConvert_v1alpha4_KubeadmControlPlaneTemplate_To_v1beta1_KubeadmControlPlaneTemplate(in *KubeadmControlPlaneTemplate, out *v1beta1.KubeadmControlPlaneTemplate, s conversion.Scope) error { out.ObjectMeta = in.ObjectMeta if err := Convert_v1alpha4_KubeadmControlPlaneTemplateSpec_To_v1beta1_KubeadmControlPlaneTemplateSpec(&in.Spec, &out.Spec, s); err != nil { diff --git a/controlplane/kubeadm/api/v1beta1/kubeadm_control_plane_types.go b/controlplane/kubeadm/api/v1beta1/kubeadm_control_plane_types.go index 823fb123f679..272ffd12a4a1 100644 --- a/controlplane/kubeadm/api/v1beta1/kubeadm_control_plane_types.go +++ b/controlplane/kubeadm/api/v1beta1/kubeadm_control_plane_types.go @@ -17,6 +17,8 @@ 
limitations under the License. package v1beta1 import ( + "time" + corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/util/intstr" @@ -49,6 +51,23 @@ const ( // KubeadmClusterConfigurationAnnotation is a machine annotation that stores the json-marshalled string of KCP ClusterConfiguration. // This annotation is used to detect any changes in ClusterConfiguration and trigger machine rollout in KCP. KubeadmClusterConfigurationAnnotation = "controlplane.cluster.x-k8s.io/kubeadm-cluster-configuration" + + // RemediationInProgressAnnotation is used to keep track that a KCP remediation is in progress, and more + // specifically it tracks that the system is in between having deleted an unhealthy machine and recreating its replacement. + // NOTE: if something external to CAPI removes this annotation the system cannot detect the above situation; this can lead to + // failures in updating remediation retry or remediation count (both counters restart from zero). + RemediationInProgressAnnotation = "controlplane.cluster.x-k8s.io/remediation-in-progress" + + // RemediationForAnnotation is used to link a new machine to the unhealthy machine it is replacing; + // please note that in case of retry, when also the remediating machine fails, the system keeps track of + // the first machine of the sequence only. + // NOTE: if something external to CAPI removes this annotation, this can lead to + // failures in updating remediation retry (the counter restarts from zero). + RemediationForAnnotation = "controlplane.cluster.x-k8s.io/remediation-for" + + // DefaultMinHealthyPeriod defines the default minimum period before we consider a remediation on a + // machine unrelated from the previous remediation. + DefaultMinHealthyPeriod = 1 * time.Hour ) // KubeadmControlPlaneSpec defines the desired state of KubeadmControlPlane. 
@@ -91,6 +110,10 @@ type KubeadmControlPlaneSpec struct { // +optional // +kubebuilder:default={type: "RollingUpdate", rollingUpdate: {maxSurge: 1}} RolloutStrategy *RolloutStrategy `json:"rolloutStrategy,omitempty"` + + // The RemediationStrategy that controls how control plane machine remediation happens. + // +optional + RemediationStrategy *RemediationStrategy `json:"remediationStrategy,omitempty"` } // KubeadmControlPlaneMachineTemplate defines the template for Machines @@ -158,6 +181,50 @@ type RollingUpdate struct { MaxSurge *intstr.IntOrString `json:"maxSurge,omitempty"` } +// RemediationStrategy allows to define how control plane machine remediation happens. +type RemediationStrategy struct { + // MaxRetry is the Max number of retries while attempting to remediate an unhealthy machine. + // A retry happens when a machine that was created as a replacement for an unhealthy machine also fails. + // For example, given a control plane with three machines M1, M2, M3: + // + // M1 become unhealthy; remediation happens, and M1-1 is created as a replacement. + // If M1-1 (replacement of M1) has problems while bootstrapping it will become unhealthy, and then be + // remediated; such operation is considered a retry, remediation-retry #1. + // If M1-2 (replacement of M1-1) becomes unhealthy, remediation-retry #2 will happen, etc. + // + // A retry could happen only after RetryPeriod from the previous retry. + // If a machine is marked as unhealthy after MinHealthyPeriod from the previous remediation expired, + // this is not considered a retry anymore because the new issue is assumed unrelated from the previous one. + // + // If not set, the remediation will be retried infinitely. + // +optional + MaxRetry *int32 `json:"maxRetry,omitempty"` + + // RetryPeriod is the duration that KCP should wait before remediating a machine being created as a replacement + // for an unhealthy machine (a retry). + // + // If not set, a retry will happen immediately. 
+ // +optional + RetryPeriod metav1.Duration `json:"retryPeriod,omitempty"` + + // MinHealthyPeriod defines the duration after which KCP will consider any failure to a machine unrelated + // from the previous one. In this case the remediation is not considered a retry anymore, and thus the retry + // counter restarts from 0. For example, assuming MinHealthyPeriod is set to 1h (default) + // + // M1 become unhealthy; remediation happens, and M1-1 is created as a replacement. + // If M1-1 (replacement of M1) has problems within the 1hr after the creation, also + // this machine will be remediated and this operation is considered a retry - a problem related + // to the original issue happened to M1 -. + // + // If instead the problem on M1-1 is happening after MinHealthyPeriod expired, e.g. four days after + // m1-1 has been created as a remediation of M1, the problem on M1-1 is considered unrelated to + // the original issue happened to M1. + // + // If not set, this value is defaulted to 1h. + // +optional + MinHealthyPeriod *metav1.Duration `json:"minHealthyPeriod,omitempty"` +} + // KubeadmControlPlaneStatus defines the observed state of KubeadmControlPlane. type KubeadmControlPlaneStatus struct { // Selector is the label selector in string format to avoid introspection @@ -223,6 +290,25 @@ type KubeadmControlPlaneStatus struct { // Conditions defines current service state of the KubeadmControlPlane. // +optional Conditions clusterv1.Conditions `json:"conditions,omitempty"` + + // LastRemediation stores info about last remediation performed. + // +optional + LastRemediation *LastRemediationStatus `json:"lastRemediation,omitempty"` +} + +// LastRemediationStatus stores info about last remediation performed. +// NOTE: if for any reason information about last remediation are lost, RetryCount is going to restart from 0 and thus +// more remediations than expected might happen. 
+type LastRemediationStatus struct { + // Machine is the machine name of the latest machine being remediated. + Machine string `json:"machine"` + + // Timestamp is when last remediation happened. It is represented in RFC3339 form and is in UTC. + Timestamp metav1.Time `json:"timestamp"` + + // RetryCount used to keep track of remediation retry for the last remediated machine. + // A retry happens when a machine that was created as a replacement for an unhealthy machine also fails. + RetryCount int32 `json:"retryCount"` } // +kubebuilder:object:root=true diff --git a/controlplane/kubeadm/api/v1beta1/kubeadm_control_plane_webhook.go b/controlplane/kubeadm/api/v1beta1/kubeadm_control_plane_webhook.go index c17ac6049ce5..031115fb6dce 100644 --- a/controlplane/kubeadm/api/v1beta1/kubeadm_control_plane_webhook.go +++ b/controlplane/kubeadm/api/v1beta1/kubeadm_control_plane_webhook.go @@ -181,6 +181,8 @@ func (in *KubeadmControlPlane) ValidateUpdate(old runtime.Object) error { {spec, "machineTemplate", "nodeDeletionTimeout"}, {spec, "replicas"}, {spec, "version"}, + {spec, "remediationStrategy"}, + {spec, "remediationStrategy", "*"}, {spec, "rolloutAfter"}, {spec, "rolloutBefore"}, {spec, "rolloutBefore", "*"}, diff --git a/controlplane/kubeadm/api/v1beta1/kubeadm_control_plane_webhook_test.go b/controlplane/kubeadm/api/v1beta1/kubeadm_control_plane_webhook_test.go index ee20890ed807..cb6cdec40787 100644 --- a/controlplane/kubeadm/api/v1beta1/kubeadm_control_plane_webhook_test.go +++ b/controlplane/kubeadm/api/v1beta1/kubeadm_control_plane_webhook_test.go @@ -410,6 +410,11 @@ func TestKubeadmControlPlaneValidateUpdate(t *testing.T) { validUpdate.Spec.RolloutBefore = &RolloutBefore{ CertificatesExpiryDays: pointer.Int32(14), } + validUpdate.Spec.RemediationStrategy = &RemediationStrategy{ + MaxRetry: pointer.Int32(50), + MinHealthyPeriod: &metav1.Duration{Duration: 10 * time.Hour}, + RetryPeriod: metav1.Duration{Duration: 10 * time.Minute}, + } 
validUpdate.Spec.KubeadmConfigSpec.Format = bootstrapv1.CloudConfig scaleToZero := before.DeepCopy() diff --git a/controlplane/kubeadm/api/v1beta1/kubeadmcontrolplanetemplate_types.go b/controlplane/kubeadm/api/v1beta1/kubeadmcontrolplanetemplate_types.go index 4817b71b18fb..9fab688e84f7 100644 --- a/controlplane/kubeadm/api/v1beta1/kubeadmcontrolplanetemplate_types.go +++ b/controlplane/kubeadm/api/v1beta1/kubeadmcontrolplanetemplate_types.go @@ -91,6 +91,10 @@ type KubeadmControlPlaneTemplateResourceSpec struct { // +optional // +kubebuilder:default={type: "RollingUpdate", rollingUpdate: {maxSurge: 1}} RolloutStrategy *RolloutStrategy `json:"rolloutStrategy,omitempty"` + + // The RemediationStrategy that controls how control plane machine remediation happens. + // +optional + RemediationStrategy *RemediationStrategy `json:"remediationStrategy,omitempty"` } // KubeadmControlPlaneTemplateMachineTemplate defines the template for Machines diff --git a/controlplane/kubeadm/api/v1beta1/zz_generated.deepcopy.go b/controlplane/kubeadm/api/v1beta1/zz_generated.deepcopy.go index 27f9d2ecd98a..5d6d56bccd09 100644 --- a/controlplane/kubeadm/api/v1beta1/zz_generated.deepcopy.go +++ b/controlplane/kubeadm/api/v1beta1/zz_generated.deepcopy.go @@ -143,6 +143,11 @@ func (in *KubeadmControlPlaneSpec) DeepCopyInto(out *KubeadmControlPlaneSpec) { *out = new(RolloutStrategy) (*in).DeepCopyInto(*out) } + if in.RemediationStrategy != nil { + in, out := &in.RemediationStrategy, &out.RemediationStrategy + *out = new(RemediationStrategy) + (*in).DeepCopyInto(*out) + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new KubeadmControlPlaneSpec. 
@@ -175,6 +180,11 @@ func (in *KubeadmControlPlaneStatus) DeepCopyInto(out *KubeadmControlPlaneStatus (*in)[i].DeepCopyInto(&(*out)[i]) } } + if in.LastRemediation != nil { + in, out := &in.LastRemediation, &out.LastRemediation + *out = new(LastRemediationStatus) + (*in).DeepCopyInto(*out) + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new KubeadmControlPlaneStatus. @@ -314,6 +324,11 @@ func (in *KubeadmControlPlaneTemplateResourceSpec) DeepCopyInto(out *KubeadmCont *out = new(RolloutStrategy) (*in).DeepCopyInto(*out) } + if in.RemediationStrategy != nil { + in, out := &in.RemediationStrategy, &out.RemediationStrategy + *out = new(RemediationStrategy) + (*in).DeepCopyInto(*out) + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new KubeadmControlPlaneTemplateResourceSpec. @@ -342,6 +357,48 @@ func (in *KubeadmControlPlaneTemplateSpec) DeepCopy() *KubeadmControlPlaneTempla return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *LastRemediationStatus) DeepCopyInto(out *LastRemediationStatus) { + *out = *in + in.Timestamp.DeepCopyInto(&out.Timestamp) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new LastRemediationStatus. +func (in *LastRemediationStatus) DeepCopy() *LastRemediationStatus { + if in == nil { + return nil + } + out := new(LastRemediationStatus) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *RemediationStrategy) DeepCopyInto(out *RemediationStrategy) { + *out = *in + if in.MaxRetry != nil { + in, out := &in.MaxRetry, &out.MaxRetry + *out = new(int32) + **out = **in + } + out.RetryPeriod = in.RetryPeriod + if in.MinHealthyPeriod != nil { + in, out := &in.MinHealthyPeriod, &out.MinHealthyPeriod + *out = new(v1.Duration) + **out = **in + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new RemediationStrategy. +func (in *RemediationStrategy) DeepCopy() *RemediationStrategy { + if in == nil { + return nil + } + out := new(RemediationStrategy) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *RollingUpdate) DeepCopyInto(out *RollingUpdate) { *out = *in diff --git a/controlplane/kubeadm/config/crd/bases/controlplane.cluster.x-k8s.io_kubeadmcontrolplanes.yaml b/controlplane/kubeadm/config/crd/bases/controlplane.cluster.x-k8s.io_kubeadmcontrolplanes.yaml index 2a8d52fb2c6e..52b83f8d68d7 100644 --- a/controlplane/kubeadm/config/crd/bases/controlplane.cluster.x-k8s.io_kubeadmcontrolplanes.yaml +++ b/controlplane/kubeadm/config/crd/bases/controlplane.cluster.x-k8s.io_kubeadmcontrolplanes.yaml @@ -3620,6 +3620,51 @@ spec: required: - infrastructureRef type: object + remediationStrategy: + description: The RemediationStrategy that controls how control plane + machine remediation happens. + properties: + maxRetry: + description: "MaxRetry is the Max number of retries while attempting + to remediate an unhealthy machine. A retry happens when a machine + that was created as a replacement for an unhealthy machine also + fails. For example, given a control plane with three machines + M1, M2, M3: \n M1 become unhealthy; remediation happens, and + M1-1 is created as a replacement. 
If M1-1 (replacement of M1) + has problems while bootstrapping it will become unhealthy, and + then be remediated; such operation is considered a retry, remediation-retry + #1. If M1-2 (replacement of M1-1) becomes unhealthy, remediation-retry + #2 will happen, etc. \n A retry could happen only after RetryPeriod + from the previous retry. If a machine is marked as unhealthy + after MinHealthyPeriod from the previous remediation expired, + this is not considered a retry anymore because the new issue + is assumed unrelated from the previous one. \n If not set, the + remediation will be retried infinitely." + format: int32 + type: integer + minHealthyPeriod: + description: "MinHealthyPeriod defines the duration after which + KCP will consider any failure to a machine unrelated from the + previous one. In this case the remediation is not considered + a retry anymore, and thus the retry counter restarts from 0. + For example, assuming MinHealthyPeriod is set to 1h (default) + \n M1 become unhealthy; remediation happens, and M1-1 is created + as a replacement. If M1-1 (replacement of M1) has problems within + the 1hr after the creation, also this machine will be remediated + and this operation is considered a retry - a problem related + to the original issue happened to M1 -. \n If instead the problem + on M1-1 is happening after MinHealthyPeriod expired, e.g. four + days after m1-1 has been created as a remediation of M1, the + problem on M1-1 is considered unrelated to the original issue + happened to M1. \n If not set, this value is defaulted to 1h." + type: string + retryPeriod: + description: "RetryPeriod is the duration that KCP should wait + before remediating a machine being created as a replacement + for an unhealthy machine (a retry). \n If not set, a retry will + happen immediately." + type: string + type: object replicas: description: Number of desired machines. Defaults to 1. 
When stacked etcd is used only odd numbers are permitted, as per [etcd best practice](https://etcd.io/docs/v3.3.12/faq/#why-an-odd-number-of-cluster-members). @@ -3746,6 +3791,30 @@ spec: description: Initialized denotes whether or not the control plane has the uploaded kubeadm-config configmap. type: boolean + lastRemediation: + description: LastRemediation stores info about last remediation performed. + properties: + machine: + description: Machine is the machine name of the latest machine + being remediated. + type: string + retryCount: + description: RetryCount used to keep track of remediation retry + for the last remediated machine. A retry happens when a machine + that was created as a replacement for an unhealthy machine also + fails. + format: int32 + type: integer + timestamp: + description: Timestamp is when last remediation happened. It is + represented in RFC3339 form and is in UTC. + format: date-time + type: string + required: + - machine + - retryCount + - timestamp + type: object observedGeneration: description: ObservedGeneration is the latest generation observed by the controller. diff --git a/controlplane/kubeadm/config/crd/bases/controlplane.cluster.x-k8s.io_kubeadmcontrolplanetemplates.yaml b/controlplane/kubeadm/config/crd/bases/controlplane.cluster.x-k8s.io_kubeadmcontrolplanetemplates.yaml index 365bf50644cb..bca4dfebb4e7 100644 --- a/controlplane/kubeadm/config/crd/bases/controlplane.cluster.x-k8s.io_kubeadmcontrolplanetemplates.yaml +++ b/controlplane/kubeadm/config/crd/bases/controlplane.cluster.x-k8s.io_kubeadmcontrolplanetemplates.yaml @@ -2382,6 +2382,55 @@ spec: time limitations. type: string type: object + remediationStrategy: + description: The RemediationStrategy that controls how control + plane machine remediation happens. + properties: + maxRetry: + description: "MaxRetry is the Max number of retries while + attempting to remediate an unhealthy machine. 
A retry + happens when a machine that was created as a replacement + for an unhealthy machine also fails. For example, given + a control plane with three machines M1, M2, M3: \n M1 + become unhealthy; remediation happens, and M1-1 is created + as a replacement. If M1-1 (replacement of M1) has problems + while bootstrapping it will become unhealthy, and then + be remediated; such operation is considered a retry, + remediation-retry #1. If M1-2 (replacement of M1-1) + becomes unhealthy, remediation-retry #2 will happen, + etc. \n A retry could happen only after RetryPeriod + from the previous retry. If a machine is marked as unhealthy + after MinHealthyPeriod from the previous remediation + expired, this is not considered a retry anymore because + the new issue is assumed unrelated from the previous + one. \n If not set, the remediation will be retried infinitely." + format: int32 + type: integer + minHealthyPeriod: + description: "MinHealthyPeriod defines the duration after + which KCP will consider any failure to a machine unrelated + from the previous one. In this case the remediation + is not considered a retry anymore, and thus the retry + counter restarts from 0. For example, assuming MinHealthyPeriod + is set to 1h (default) \n M1 become unhealthy; remediation + happens, and M1-1 is created as a replacement. If M1-1 + (replacement of M1) has problems within the 1hr after + the creation, also this machine will be remediated and + this operation is considered a retry - a problem related + to the original issue happened to M1 -. \n If instead + the problem on M1-1 is happening after MinHealthyPeriod + expired, e.g. four days after m1-1 has been created + as a remediation of M1, the problem on M1-1 is considered + unrelated to the original issue happened to M1. \n If + not set, this value is defaulted to 1h." 
+ type: string + retryPeriod: + description: "RetryPeriod is the duration that KCP should + wait before remediating a machine being created as a + replacement for an unhealthy machine (a retry). \n If + not set, a retry will happen immediately." + type: string + type: object rolloutAfter: description: RolloutAfter is a field to indicate a rollout should be performed after the specified time even if no diff --git a/controlplane/kubeadm/internal/controllers/helpers.go b/controlplane/kubeadm/internal/controllers/helpers.go index 9bd59c7b8dac..bae5e7918b2d 100644 --- a/controlplane/kubeadm/internal/controllers/helpers.go +++ b/controlplane/kubeadm/internal/controllers/helpers.go @@ -314,6 +314,13 @@ func (r *KubeadmControlPlaneReconciler) generateMachine(ctx context.Context, kcp }, } + // In case this machine is being created as a consequence of a remediation, then add an annotation + // tracking remediating data. + // NOTE: This is required in order to track remediation retries. + if remediationData, ok := kcp.Annotations[controlplanev1.RemediationInProgressAnnotation]; ok { + machine.Annotations[controlplanev1.RemediationForAnnotation] = remediationData + } + // Machine's bootstrap config may be missing ClusterConfiguration if it is not the first machine in the control plane. // We store ClusterConfiguration as annotation here to detect any changes in KCP ClusterConfiguration and rollout the machine if any. clusterConfig, err := json.Marshal(kcp.Spec.KubeadmConfigSpec.ClusterConfiguration) @@ -331,5 +338,10 @@ func (r *KubeadmControlPlaneReconciler) generateMachine(ctx context.Context, kcp if err := r.Client.Create(ctx, machine); err != nil { return errors.Wrap(err, "failed to create machine") } + + // Remove the annotation tracking that a remediation is in progress (the remediation completed when + // the replacement machine has been created above). 
+ delete(kcp.Annotations, controlplanev1.RemediationInProgressAnnotation) + return nil } diff --git a/controlplane/kubeadm/internal/controllers/helpers_test.go b/controlplane/kubeadm/internal/controllers/helpers_test.go index 53e699925434..39a76fdb6799 100644 --- a/controlplane/kubeadm/internal/controllers/helpers_test.go +++ b/controlplane/kubeadm/internal/controllers/helpers_test.go @@ -512,6 +512,9 @@ func TestKubeadmControlPlaneReconciler_generateMachine(t *testing.T) { ObjectMeta: metav1.ObjectMeta{ Name: "testControlPlane", Namespace: cluster.Namespace, + Annotations: map[string]string{ + controlplanev1.RemediationInProgressAnnotation: "foo", + }, }, Spec: controlplanev1.KubeadmControlPlaneSpec{ Version: "v1.16.6", @@ -562,6 +565,8 @@ func TestKubeadmControlPlaneReconciler_generateMachine(t *testing.T) { g.Expect(machine.Namespace).To(Equal(kcp.Namespace)) g.Expect(machine.OwnerReferences).To(HaveLen(1)) g.Expect(machine.OwnerReferences).To(ContainElement(*metav1.NewControllerRef(kcp, controlplanev1.GroupVersion.WithKind("KubeadmControlPlane")))) + g.Expect(machine.Annotations).To(HaveKeyWithValue(controlplanev1.RemediationForAnnotation, "foo")) + g.Expect(kcp.Annotations).ToNot(HaveKey(controlplanev1.RemediationInProgressAnnotation)) g.Expect(machine.Spec).To(Equal(expectedMachineSpec)) // Verify that the machineTemplate.ObjectMeta has been propagated to the Machine. 
diff --git a/controlplane/kubeadm/internal/controllers/remediation.go b/controlplane/kubeadm/internal/controllers/remediation.go index 429f8a2e954a..72a0eb8236aa 100644 --- a/controlplane/kubeadm/internal/controllers/remediation.go +++ b/controlplane/kubeadm/internal/controllers/remediation.go @@ -18,10 +18,14 @@ package controllers import ( "context" + "encoding/json" "fmt" + "time" "github.com/blang/semver" + "github.com/go-logr/logr" "github.com/pkg/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" kerrors "k8s.io/apimachinery/pkg/util/errors" "k8s.io/klog/v2" ctrl "sigs.k8s.io/controller-runtime" @@ -31,6 +35,7 @@ import ( controlplanev1 "sigs.k8s.io/cluster-api/controlplane/kubeadm/api/v1beta1" "sigs.k8s.io/cluster-api/controlplane/kubeadm/internal" "sigs.k8s.io/cluster-api/util" + "sigs.k8s.io/cluster-api/util/annotations" "sigs.k8s.io/cluster-api/util/conditions" "sigs.k8s.io/cluster-api/util/patch" ) @@ -39,6 +44,7 @@ import ( // based on the process described in https://github.com/kubernetes-sigs/cluster-api/blob/main/docs/proposals/20191017-kubeadm-based-control-plane.md#remediation-using-delete-and-recreate func (r *KubeadmControlPlaneReconciler) reconcileUnhealthyMachines(ctx context.Context, controlPlane *internal.ControlPlane) (ret ctrl.Result, retErr error) { log := ctrl.LoggerFrom(ctx) + reconciliationTime := time.Now().UTC() // Cleanup pending remediation actions not completed for any reasons (e.g. number of current replicas is less or equal to 1) // if the underlying machine is now back to healthy / not deleting. @@ -88,6 +94,16 @@ func (r *KubeadmControlPlaneReconciler) reconcileUnhealthyMachines(ctx context.C return ctrl.Result{}, nil } + log = log.WithValues("Machine", klog.KObj(machineToBeRemediated), "initialized", controlPlane.KCP.Status.Initialized) + + // Returns if another remediation is in progress but the new Machine is not yet created. 
+ // Note: This condition is checked after we check for unhealthy Machines and if machineToBeRemediated + // is being deleted to avoid unnecessary logs if no further remediation should be done. + if _, ok := controlPlane.KCP.Annotations[controlplanev1.RemediationInProgressAnnotation]; ok { + log.Info("Another remediation is already in progress. Skipping remediation.") + return ctrl.Result{}, nil + } + patchHelper, err := patch.NewHelper(machineToBeRemediated, r.Client) if err != nil { return ctrl.Result{}, err @@ -107,93 +123,220 @@ func (r *KubeadmControlPlaneReconciler) reconcileUnhealthyMachines(ctx context.C // Before starting remediation, run preflight checks in order to verify it is safe to remediate. // If any of the following checks fails, we'll surface the reason in the MachineOwnerRemediated condition. - log.WithValues("Machine", klog.KObj(machineToBeRemediated)) - desiredReplicas := int(*controlPlane.KCP.Spec.Replicas) - // The cluster MUST have more than one replica, because this is the smallest cluster size that allows any etcd failure tolerance. - if controlPlane.Machines.Len() <= 1 { - log.Info("A control plane machine needs remediation, but the number of current replicas is less or equal to 1. Skipping remediation", "Replicas", controlPlane.Machines.Len()) - conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate if current replicas are less or equal to 1") - return ctrl.Result{}, nil + // Check if KCP is allowed to remediate considering retry limits: + // - Remediation cannot happen because retryPeriod is not yet expired. + // - KCP already reached MaxRetries limit. 
+ remediationInProgressData, canRemediate, err := r.checkRetryLimits(log, machineToBeRemediated, controlPlane, reconciliationTime) + if err != nil { + return ctrl.Result{}, err } - - // The number of replicas MUST be equal to or greater than the desired replicas. This rule ensures that when the cluster - // is missing replicas, we skip remediation and instead perform regular scale up/rollout operations first. - if controlPlane.Machines.Len() < desiredReplicas { - log.Info("A control plane machine needs remediation, but the current number of replicas is lower that expected. Skipping remediation", "Replicas", desiredReplicas, "CurrentReplicas", controlPlane.Machines.Len()) - conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP waiting for having at least %d control plane machines before triggering remediation", desiredReplicas) + if !canRemediate { + // NOTE: log lines and conditions surfacing why it is not possible to remediate are set by checkRetryLimits. return ctrl.Result{}, nil } - // The cluster MUST have no machines with a deletion timestamp. This rule prevents KCP taking actions while the cluster is in a transitional state. - if controlPlane.HasDeletingMachine() { - log.Info("A control plane machine needs remediation, but there are other control-plane machines being deleted. 
Skipping remediation") - conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP waiting for control plane machine deletion to complete before triggering remediation") - return ctrl.Result{}, nil - } + if controlPlane.KCP.Status.Initialized { + // Executes checks that apply only if the control plane is already initialized; in this case KCP can + // remediate only if it can safely assume that the operation preserves the operation state of the + // existing cluster (or at least it doesn't make it worse). - // Remediation MUST preserve etcd quorum. This rule ensures that we will not remove a member that would result in etcd - // losing a majority of members and thus become unable to field new requests. - if controlPlane.IsEtcdManaged() { - canSafelyRemediate, err := r.canSafelyRemoveEtcdMember(ctx, controlPlane, machineToBeRemediated) - if err != nil { - conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.RemediationFailedReason, clusterv1.ConditionSeverityError, err.Error()) - return ctrl.Result{}, err - } - if !canSafelyRemediate { - log.Info("A control plane machine needs remediation, but removing this machine could result in etcd quorum loss. Skipping remediation") - conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate this machine because this could result in etcd loosing quorum") + // The cluster MUST have more than one replica, because this is the smallest cluster size that allows any etcd failure tolerance. + if controlPlane.Machines.Len() <= 1 { + log.Info("A control plane machine needs remediation, but the number of current replicas is less or equal to 1. 
Skipping remediation", "Replicas", controlPlane.Machines.Len()) + conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate if current replicas are less or equal to 1") return ctrl.Result{}, nil } - } - - workloadCluster, err := r.managementCluster.GetWorkloadCluster(ctx, util.ObjectKey(controlPlane.Cluster)) - if err != nil { - log.Error(err, "Failed to create client to workload cluster") - return ctrl.Result{}, errors.Wrapf(err, "failed to create client to workload cluster") - } - // If the machine that is about to be deleted is the etcd leader, move it to the newest member available. - if controlPlane.IsEtcdManaged() { - etcdLeaderCandidate := controlPlane.HealthyMachines().Newest() - if etcdLeaderCandidate == nil { - log.Info("A control plane machine needs remediation, but there is no healthy machine to forward etcd leadership to") - conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.RemediationFailedReason, clusterv1.ConditionSeverityWarning, - "A control plane machine needs remediation, but there is no healthy machine to forward etcd leadership to. Skipping remediation") + // The cluster MUST have no machines with a deletion timestamp. This rule prevents KCP taking actions while the cluster is in a transitional state. + if controlPlane.HasDeletingMachine() { + log.Info("A control plane machine needs remediation, but there are other control-plane machines being deleted. 
Skipping remediation") + conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP waiting for control plane machine deletion to complete before triggering remediation") return ctrl.Result{}, nil } - if err := workloadCluster.ForwardEtcdLeadership(ctx, machineToBeRemediated, etcdLeaderCandidate); err != nil { - log.Error(err, "Failed to move etcd leadership to candidate machine", "candidate", klog.KObj(etcdLeaderCandidate)) - conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.RemediationFailedReason, clusterv1.ConditionSeverityError, err.Error()) - return ctrl.Result{}, err + + // Remediation MUST preserve etcd quorum. This rule ensures that KCP will not remove a member that would result in etcd + // losing a majority of members and thus become unable to field new requests. + if controlPlane.IsEtcdManaged() { + canSafelyRemediate, err := r.canSafelyRemoveEtcdMember(ctx, controlPlane, machineToBeRemediated) + if err != nil { + conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.RemediationFailedReason, clusterv1.ConditionSeverityError, err.Error()) + return ctrl.Result{}, err + } + if !canSafelyRemediate { + log.Info("A control plane machine needs remediation, but removing this machine could result in etcd quorum loss. 
Skipping remediation") + conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate this machine because this could result in etcd loosing quorum") + return ctrl.Result{}, nil + } } - if err := workloadCluster.RemoveEtcdMemberForMachine(ctx, machineToBeRemediated); err != nil { - log.Error(err, "Failed to remove etcd member for machine") - conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.RemediationFailedReason, clusterv1.ConditionSeverityError, err.Error()) - return ctrl.Result{}, err + + // Start remediating the unhealthy control plane machine by deleting it. + // A new machine will come up completing the operation as part of the regular reconcile. + + // If the control plane is initialized, before deleting the machine: + // - if the machine hosts the etcd leader, forward etcd leadership to another machine. + // - delete the etcd member hosted on the machine being deleted. + // - remove the etcd member from the kubeadm config map (only for kubernetes version older than v1.22.0) + workloadCluster, err := r.managementCluster.GetWorkloadCluster(ctx, util.ObjectKey(controlPlane.Cluster)) + if err != nil { + log.Error(err, "Failed to create client to workload cluster") + return ctrl.Result{}, errors.Wrapf(err, "failed to create client to workload cluster") } - } - parsedVersion, err := semver.ParseTolerant(controlPlane.KCP.Spec.Version) - if err != nil { - return ctrl.Result{}, errors.Wrapf(err, "failed to parse kubernetes version %q", controlPlane.KCP.Spec.Version) - } + // If the machine that is about to be deleted is the etcd leader, move it to the newest member available. 
+ if controlPlane.IsEtcdManaged() { + etcdLeaderCandidate := controlPlane.HealthyMachines().Newest() + if etcdLeaderCandidate == nil { + log.Info("A control plane machine needs remediation, but there is no healthy machine to forward etcd leadership to") + conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.RemediationFailedReason, clusterv1.ConditionSeverityWarning, + "A control plane machine needs remediation, but there is no healthy machine to forward etcd leadership to. Skipping remediation") + return ctrl.Result{}, nil + } + if err := workloadCluster.ForwardEtcdLeadership(ctx, machineToBeRemediated, etcdLeaderCandidate); err != nil { + log.Error(err, "Failed to move etcd leadership to candidate machine", "candidate", klog.KObj(etcdLeaderCandidate)) + conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.RemediationFailedReason, clusterv1.ConditionSeverityError, err.Error()) + return ctrl.Result{}, err + } + if err := workloadCluster.RemoveEtcdMemberForMachine(ctx, machineToBeRemediated); err != nil { + log.Error(err, "Failed to remove etcd member for machine") + conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.RemediationFailedReason, clusterv1.ConditionSeverityError, err.Error()) + return ctrl.Result{}, err + } + } - if err := workloadCluster.RemoveMachineFromKubeadmConfigMap(ctx, machineToBeRemediated, parsedVersion); err != nil { - log.Error(err, "Failed to remove machine from kubeadm ConfigMap") - return ctrl.Result{}, err + parsedVersion, err := semver.ParseTolerant(controlPlane.KCP.Spec.Version) + if err != nil { + return ctrl.Result{}, errors.Wrapf(err, "failed to parse kubernetes version %q", controlPlane.KCP.Spec.Version) + } + + if err := workloadCluster.RemoveMachineFromKubeadmConfigMap(ctx, machineToBeRemediated, parsedVersion); err != nil { + log.Error(err, "Failed to remove machine from kubeadm ConfigMap") + 
return ctrl.Result{}, err + } } + // Delete the machine if err := r.Client.Delete(ctx, machineToBeRemediated); err != nil { conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.RemediationFailedReason, clusterv1.ConditionSeverityError, err.Error()) return ctrl.Result{}, errors.Wrapf(err, "failed to delete unhealthy machine %s", machineToBeRemediated.Name) } + // Surface the operation is in progress. log.Info("Remediating unhealthy machine") conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "") + + // Prepare the info for tracking the remediation progress into the RemediationInProgressAnnotation. + remediationInProgressValue, err := remediationInProgressData.Marshal() + if err != nil { + return ctrl.Result{}, err + } + + // Set annotations tracking remediation details so they can be picked up by the machine + // that will be created as part of the scale up action that completes the remediation. + annotations.AddAnnotations(controlPlane.KCP, map[string]string{ + controlplanev1.RemediationInProgressAnnotation: remediationInProgressValue, + }) + return ctrl.Result{Requeue: true}, nil } +// checkRetryLimits checks if KCP is allowed to remediate considering retry limits: +// - Remediation cannot happen because retryPeriod is not yet expired. +// - KCP already reached the maximum number of retries for a machine. +// NOTE: Counting the number of retries is required In order to prevent infinite remediation e.g. in case the +// first Control Plane machine is failing due to quota issue. +func (r *KubeadmControlPlaneReconciler) checkRetryLimits(log logr.Logger, machineToBeRemediated *clusterv1.Machine, controlPlane *internal.ControlPlane, reconciliationTime time.Time) (*RemediationData, bool, error) { + // Get last remediation info from the machine. 
+ var lastRemediationData *RemediationData + if value, ok := machineToBeRemediated.Annotations[controlplanev1.RemediationForAnnotation]; ok { + l, err := RemediationDataFromAnnotation(value) + if err != nil { + return nil, false, err + } + lastRemediationData = l + } + + remediationInProgressData := &RemediationData{ + Machine: machineToBeRemediated.Name, + Timestamp: metav1.Time{Time: reconciliationTime}, + RetryCount: 0, + } + + // If there is no last remediation, this is the first try of a new retry sequence. + if lastRemediationData == nil { + return remediationInProgressData, true, nil + } + + // Gets MinHealthyPeriod and RetryPeriod from the remediation strategy, or use defaults. + minHealthyPeriod := controlplanev1.DefaultMinHealthyPeriod + if controlPlane.KCP.Spec.RemediationStrategy != nil && controlPlane.KCP.Spec.RemediationStrategy.MinHealthyPeriod != nil { + minHealthyPeriod = controlPlane.KCP.Spec.RemediationStrategy.MinHealthyPeriod.Duration + } + retryPeriod := time.Duration(0) + if controlPlane.KCP.Spec.RemediationStrategy != nil { + retryPeriod = controlPlane.KCP.Spec.RemediationStrategy.RetryPeriod.Duration + } + + // Gets the timestamp of the last remediation; if missing, default to a value + // that ensures both MinHealthyPeriod and RetryPeriod are expired. + // NOTE: this could potentially lead to executing more retries than expected or to executing retries before than + // expected, but this is considered acceptable when the system recovers from someone/something changes or deletes + // the RemediationForAnnotation on Machines. + lastRemediationTime := reconciliationTime.Add(-2 * max(minHealthyPeriod, retryPeriod)) + if !lastRemediationData.Timestamp.IsZero() { + lastRemediationTime = lastRemediationData.Timestamp.Time + } + + // Once we get here we already know that there was a last remediation for the Machine. 
+ // If the current remediation is happening before minHealthyPeriod is expired, then KCP considers this + // as a remediation for the same previously unhealthy machine. + // NOTE: If someone/something changes the RemediationForAnnotation on Machines (e.g. changes the Timestamp), + // this could potentially lead to executing more retries than expected, but this is considered acceptable in such a case. + var retryForSameMachineInProgress bool + if lastRemediationTime.Add(minHealthyPeriod).After(reconciliationTime) { + retryForSameMachineInProgress = true + log = log.WithValues("RemediationRetryFor", klog.KRef(machineToBeRemediated.Namespace, lastRemediationData.Machine)) + } + + // If the retry for the same machine is not in progress, this is the first try of a new retry sequence. + if !retryForSameMachineInProgress { + return remediationInProgressData, true, nil + } + + // If the remediation is for the same machine, carry over the retry count. + remediationInProgressData.RetryCount = lastRemediationData.RetryCount + + // Check if remediation can happen because retryPeriod has passed. + if lastRemediationTime.Add(retryPeriod).After(reconciliationTime) { + log.Info(fmt.Sprintf("A control plane machine needs remediation, but the operation already failed in the latest %s. Skipping remediation", retryPeriod)) + conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate this machine because the operation already failed in the latest %s (RetryPeriod)", retryPeriod) + return remediationInProgressData, false, nil + } + + // Check if remediation can happen because maxRetry is not reached yet, if defined. 
+ if controlPlane.KCP.Spec.RemediationStrategy != nil && controlPlane.KCP.Spec.RemediationStrategy.MaxRetry != nil { + maxRetry := int(*controlPlane.KCP.Spec.RemediationStrategy.MaxRetry) + if remediationInProgressData.RetryCount >= maxRetry { + log.Info(fmt.Sprintf("A control plane machine needs remediation, but the operation already failed %d times (MaxRetry %d). Skipping remediation", remediationInProgressData.RetryCount, maxRetry)) + conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate this machine because the operation already failed %d times (MaxRetry)", maxRetry) + return remediationInProgressData, false, nil + } + } + + // All the check passed, increase the remediation retry count. + remediationInProgressData.RetryCount++ + + return remediationInProgressData, true, nil +} + +// max calculates the maximum duration. +func max(x, y time.Duration) time.Duration { + if x < y { + return y + } + return x +} + // canSafelyRemoveEtcdMember assess if it is possible to remove the member hosted on the machine to be remediated // without loosing etcd quorum. // @@ -208,7 +351,7 @@ func (r *KubeadmControlPlaneReconciler) reconcileUnhealthyMachines(ctx context.C // - etc. // // NOTE: this func assumes the list of members in sync with the list of machines/nodes, it is required to call reconcileEtcdMembers -// ans well as reconcileControlPlaneConditions before this. +// as well as reconcileControlPlaneConditions before this. 
func (r *KubeadmControlPlaneReconciler) canSafelyRemoveEtcdMember(ctx context.Context, controlPlane *internal.ControlPlane, machineToBeRemediated *clusterv1.Machine) (bool, error) { log := ctrl.LoggerFrom(ctx) @@ -258,10 +401,10 @@ func (r *KubeadmControlPlaneReconciler) canSafelyRemoveEtcdMember(ctx context.Co } } - // If an etcd member does not have a corresponding machine, it is not possible to retrieve etcd member health - // so we are assuming the worst scenario and considering the member unhealthy. + // If an etcd member does not have a corresponding machine it is not possible to retrieve etcd member health, + // so KCP is assuming the worst scenario and considering the member unhealthy. // - // NOTE: This should not happen given that we are running reconcileEtcdMembers before calling this method. + // NOTE: This should not happen given that KCP is running reconcileEtcdMembers before calling this method. if machine == nil { log.Info("An etcd member does not have a corresponding machine, assuming this member is unhealthy", "MemberName", etcdMember) targetUnhealthyMembers++ @@ -293,3 +436,44 @@ func (r *KubeadmControlPlaneReconciler) canSafelyRemoveEtcdMember(ctx context.Co return canSafelyRemediate, nil } + +// RemediationData struct is used to keep track of information stored in the RemediationInProgressAnnotation in KCP +// during remediation and then into the RemediationForAnnotation on the replacement machine once it is created. +type RemediationData struct { + // Machine is the machine name of the latest machine being remediated. + Machine string `json:"machine"` + + // Timestamp is when last remediation happened. It is represented in RFC3339 form and is in UTC. + Timestamp metav1.Time `json:"timestamp"` + + // RetryCount used to keep track of remediation retry for the last remediated machine. + // A retry happens when a machine that was created as a replacement for an unhealthy machine also fails. 
+ RetryCount int `json:"retryCount"` +} + +// RemediationDataFromAnnotation gets RemediationData from an annotation value. +func RemediationDataFromAnnotation(value string) (*RemediationData, error) { + ret := &RemediationData{} + if err := json.Unmarshal([]byte(value), ret); err != nil { + return nil, errors.Wrapf(err, "failed to unmarshal value %s for %s annotation", value, clusterv1.RemediationInProgressReason) + } + return ret, nil +} + +// Marshal an RemediationData into an annotation value. +func (r *RemediationData) Marshal() (string, error) { + b, err := json.Marshal(r) + if err != nil { + return "", errors.Wrapf(err, "failed to marshal value for %s annotation", clusterv1.RemediationInProgressReason) + } + return string(b), nil +} + +// ToStatus converts a RemediationData into a LastRemediationStatus struct. +func (r *RemediationData) ToStatus() *controlplanev1.LastRemediationStatus { + return &controlplanev1.LastRemediationStatus{ + Machine: r.Machine, + Timestamp: r.Timestamp, + RetryCount: int32(r.RetryCount), + } +} diff --git a/controlplane/kubeadm/internal/controllers/remediation_test.go b/controlplane/kubeadm/internal/controllers/remediation_test.go index e6c13e1691b3..26183baa1cc7 100644 --- a/controlplane/kubeadm/internal/controllers/remediation_test.go +++ b/controlplane/kubeadm/internal/controllers/remediation_test.go @@ -42,7 +42,7 @@ import ( func TestReconcileUnhealthyMachines(t *testing.T) { g := NewWithT(t) - ctx := context.TODO() + r := &KubeadmControlPlaneReconciler{ Client: env.GetClient(), recorder: record.NewFakeRecorder(32), @@ -53,7 +53,14 @@ func TestReconcileUnhealthyMachines(t *testing.T) { g.Expect(env.Cleanup(ctx, ns)).To(Succeed()) }() - t.Run("Remediation cleans up stuck remediation on previously unhealthy machines", func(t *testing.T) { + var removeFinalizer = func(g *WithT, m *clusterv1.Machine) { + patchHelper, err := patch.NewHelper(m, env.GetClient()) + g.Expect(err).ToNot(HaveOccurred()) + m.ObjectMeta.Finalizers = nil + 
g.Expect(patchHelper.Patch(ctx, m)) + } + + t.Run("It cleans up stuck remediation on previously unhealthy machines", func(t *testing.T) { g := NewWithT(t) m := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withStuckRemediation()) @@ -63,7 +70,7 @@ func TestReconcileUnhealthyMachines(t *testing.T) { Cluster: &clusterv1.Cluster{}, Machines: collections.FromMachines(m), } - ret, err := r.reconcileUnhealthyMachines(context.TODO(), controlPlane) + ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane) g.Expect(ret.IsZero()).To(BeTrue()) // Remediation skipped g.Expect(err).ToNot(HaveOccurred()) @@ -79,6 +86,10 @@ func TestReconcileUnhealthyMachines(t *testing.T) { return errors.Errorf("condition %s still exists", clusterv1.MachineOwnerRemediatedCondition) }, 10*time.Second).Should(Succeed()) }) + + // Generic preflight checks + // Those are preflight checks that happen no matter if the control plane has already been initialized or not. + t.Run("Remediation does not happen if there are no unhealthy machines", func(t *testing.T) { g := NewWithT(t) @@ -87,12 +98,38 @@ func TestReconcileUnhealthyMachines(t *testing.T) { Cluster: &clusterv1.Cluster{}, Machines: collections.New(), } - ret, err := r.reconcileUnhealthyMachines(context.TODO(), controlPlane) + ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane) g.Expect(ret.IsZero()).To(BeTrue()) // Remediation skipped g.Expect(err).ToNot(HaveOccurred()) }) - t.Run("reconcileUnhealthyMachines return early if the machine to be remediated is marked for deletion", func(t *testing.T) { + t.Run("reconcileUnhealthyMachines return early if another remediation is in progress", func(t *testing.T) { + g := NewWithT(t) + + m := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withStuckRemediation()) + conditions.MarkFalse(m, clusterv1.MachineHealthCheckSucceededCondition, clusterv1.MachineHasFailureReason, clusterv1.ConditionSeverityWarning, "") + conditions.MarkFalse(m, clusterv1.MachineOwnerRemediatedCondition, 
clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "") + controlPlane := &internal.ControlPlane{ + KCP: &controlplanev1.KubeadmControlPlane{ + ObjectMeta: metav1.ObjectMeta{ + Annotations: map[string]string{ + controlplanev1.RemediationInProgressAnnotation: MustMarshalRemediationData(&RemediationData{ + Machine: "foo", + Timestamp: metav1.Time{Time: time.Now().UTC()}, + RetryCount: 0, + }), + }, + }, + }, + Cluster: &clusterv1.Cluster{}, + Machines: collections.FromMachines(m), + } + ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane) + + g.Expect(ret.IsZero()).To(BeTrue()) // Remediation skipped + g.Expect(err).ToNot(HaveOccurred()) + }) + t.Run("reconcileUnhealthyMachines return early if the machine to be remediated is already being deleted", func(t *testing.T) { g := NewWithT(t) m := getDeletingMachine(ns.Name, "m1-unhealthy-deleting-", withMachineHealthCheckFailed()) @@ -103,92 +140,644 @@ func TestReconcileUnhealthyMachines(t *testing.T) { Cluster: &clusterv1.Cluster{}, Machines: collections.FromMachines(m), } - ret, err := r.reconcileUnhealthyMachines(context.TODO(), controlPlane) + ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane) + + g.Expect(controlPlane.KCP.Annotations).ToNot(HaveKey(controlplanev1.RemediationInProgressAnnotation)) + + g.Expect(ret.IsZero()).To(BeTrue()) // Remediation skipped + g.Expect(err).ToNot(HaveOccurred()) + }) + t.Run("Remediation does not happen if MaxRetry is reached", func(t *testing.T) { + g := NewWithT(t) + + m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer(), withRemediateForAnnotation(MustMarshalRemediationData(&RemediationData{ + Machine: "m0", + Timestamp: metav1.Time{Time: time.Now().Add(-controlplanev1.DefaultMinHealthyPeriod / 2).UTC()}, // minHealthy not expired yet. 
+ RetryCount: 3, + }))) + m2 := createMachine(ctx, g, ns.Name, "m2-healthy-", withHealthyEtcdMember()) + m3 := createMachine(ctx, g, ns.Name, "m3-healthy-", withHealthyEtcdMember()) + + controlPlane := &internal.ControlPlane{ + KCP: &controlplanev1.KubeadmControlPlane{ + Spec: controlplanev1.KubeadmControlPlaneSpec{ + Replicas: utilpointer.Int32(3), + Version: "v1.19.1", + RemediationStrategy: &controlplanev1.RemediationStrategy{ + MaxRetry: utilpointer.Int32(3), + }, + }, + }, + Cluster: &clusterv1.Cluster{}, + Machines: collections.FromMachines(m1, m2, m3), + } + + r := &KubeadmControlPlaneReconciler{ + Client: env.GetClient(), + recorder: record.NewFakeRecorder(32), + managementCluster: &fakeManagementCluster{ + Workload: fakeWorkloadCluster{ + EtcdMembersResult: nodes(controlPlane.Machines), + }, + }, + } + + ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane) g.Expect(ret.IsZero()).To(BeTrue()) // Remediation skipped g.Expect(err).ToNot(HaveOccurred()) + + g.Expect(controlPlane.KCP.Annotations).ToNot(HaveKey(controlplanev1.RemediationInProgressAnnotation)) + + assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate this machine because the operation already failed 3 times (MaxRetry)") + + err = env.Get(ctx, client.ObjectKey{Namespace: m1.Namespace, Name: m1.Name}, m1) + g.Expect(err).ToNot(HaveOccurred()) + g.Expect(m1.ObjectMeta.DeletionTimestamp.IsZero()).To(BeTrue()) + + removeFinalizer(g, m1) + g.Expect(env.Cleanup(ctx, m1, m2, m3)).To(Succeed()) }) + t.Run("Retry history is ignored if min healthy period is expired, default min healthy period", func(t *testing.T) { + g := NewWithT(t) + + m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer(), withRemediateForAnnotation(MustMarshalRemediationData(&RemediationData{ + Machine: "m0", + Timestamp: 
metav1.Time{Time: time.Now().Add(-2 * controlplanev1.DefaultMinHealthyPeriod).UTC()}, // minHealthyPeriod already expired. + RetryCount: 3, + }))) + m2 := createMachine(ctx, g, ns.Name, "m2-healthy-", withHealthyEtcdMember()) + m3 := createMachine(ctx, g, ns.Name, "m3-healthy-", withHealthyEtcdMember()) + + controlPlane := &internal.ControlPlane{ + KCP: &controlplanev1.KubeadmControlPlane{ + Spec: controlplanev1.KubeadmControlPlaneSpec{ + Replicas: utilpointer.Int32(3), + Version: "v1.19.1", + RemediationStrategy: &controlplanev1.RemediationStrategy{ + MaxRetry: utilpointer.Int32(3), + }, + }, + }, + Cluster: &clusterv1.Cluster{}, + Machines: collections.FromMachines(m1, m2, m3), + } + + r := &KubeadmControlPlaneReconciler{ + Client: env.GetClient(), + recorder: record.NewFakeRecorder(32), + managementCluster: &fakeManagementCluster{ + Workload: fakeWorkloadCluster{ + EtcdMembersResult: nodes(controlPlane.Machines), + }, + }, + } + + ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane) + + g.Expect(ret.IsZero()).To(BeFalse()) // Remediation completed, requeue + g.Expect(err).ToNot(HaveOccurred()) + + g.Expect(controlPlane.KCP.Annotations).To(HaveKey(controlplanev1.RemediationInProgressAnnotation)) + remediationData, err := RemediationDataFromAnnotation(controlPlane.KCP.Annotations[controlplanev1.RemediationInProgressAnnotation]) + g.Expect(err).ToNot(HaveOccurred()) + g.Expect(remediationData.Machine).To(Equal(m1.Name)) + g.Expect(remediationData.RetryCount).To(Equal(0)) + + assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "") + + err = env.Get(ctx, client.ObjectKey{Namespace: m1.Namespace, Name: m1.Name}, m1) + g.Expect(err).ToNot(HaveOccurred()) + g.Expect(m1.ObjectMeta.DeletionTimestamp.IsZero()).To(BeFalse()) + + removeFinalizer(g, m1) + g.Expect(env.Cleanup(ctx, m1, m2, m3)).To(Succeed()) + }) + t.Run("Retry history is ignored 
if min healthy period is expired", func(t *testing.T) { + g := NewWithT(t) + + minHealthyPeriod := 4 * controlplanev1.DefaultMinHealthyPeriod // big min healthy period, so we are user that we are not using DefaultMinHealthyPeriod. + + m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer(), withRemediateForAnnotation(MustMarshalRemediationData(&RemediationData{ + Machine: "m0", + Timestamp: metav1.Time{Time: time.Now().Add(-2 * minHealthyPeriod).UTC()}, // minHealthyPeriod already expired. + RetryCount: 3, + }))) + m2 := createMachine(ctx, g, ns.Name, "m2-healthy-", withHealthyEtcdMember()) + m3 := createMachine(ctx, g, ns.Name, "m3-healthy-", withHealthyEtcdMember()) + + controlPlane := &internal.ControlPlane{ + KCP: &controlplanev1.KubeadmControlPlane{ + Spec: controlplanev1.KubeadmControlPlaneSpec{ + Replicas: utilpointer.Int32(3), + Version: "v1.19.1", + RemediationStrategy: &controlplanev1.RemediationStrategy{ + MaxRetry: utilpointer.Int32(3), + MinHealthyPeriod: &metav1.Duration{Duration: minHealthyPeriod}, + }, + }, + }, + Cluster: &clusterv1.Cluster{}, + Machines: collections.FromMachines(m1, m2, m3), + } + + r := &KubeadmControlPlaneReconciler{ + Client: env.GetClient(), + recorder: record.NewFakeRecorder(32), + managementCluster: &fakeManagementCluster{ + Workload: fakeWorkloadCluster{ + EtcdMembersResult: nodes(controlPlane.Machines), + }, + }, + } + + ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane) + + g.Expect(ret.IsZero()).To(BeFalse()) // Remediation completed, requeue + g.Expect(err).ToNot(HaveOccurred()) + + g.Expect(controlPlane.KCP.Annotations).To(HaveKey(controlplanev1.RemediationInProgressAnnotation)) + remediationData, err := RemediationDataFromAnnotation(controlPlane.KCP.Annotations[controlplanev1.RemediationInProgressAnnotation]) + g.Expect(err).ToNot(HaveOccurred()) + g.Expect(remediationData.Machine).To(Equal(m1.Name)) + 
g.Expect(remediationData.RetryCount).To(Equal(0)) + + assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "") + + err = env.Get(ctx, client.ObjectKey{Namespace: m1.Namespace, Name: m1.Name}, m1) + g.Expect(err).ToNot(HaveOccurred()) + g.Expect(m1.ObjectMeta.DeletionTimestamp.IsZero()).To(BeFalse()) + + removeFinalizer(g, m1) + g.Expect(env.Cleanup(ctx, m1, m2, m3)).To(Succeed()) + }) + t.Run("Remediation does not happen if RetryPeriod is not yet passed", func(t *testing.T) { + g := NewWithT(t) + + m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer(), withRemediateForAnnotation(MustMarshalRemediationData(&RemediationData{ + Machine: "m0", + Timestamp: metav1.Time{Time: time.Now().Add(-controlplanev1.DefaultMinHealthyPeriod / 2).UTC()}, // minHealthyPeriod not yet expired. + RetryCount: 2, + }))) + m2 := createMachine(ctx, g, ns.Name, "m2-healthy-", withHealthyEtcdMember()) + m3 := createMachine(ctx, g, ns.Name, "m3-healthy-", withHealthyEtcdMember()) + + controlPlane := &internal.ControlPlane{ + KCP: &controlplanev1.KubeadmControlPlane{ + Spec: controlplanev1.KubeadmControlPlaneSpec{ + Replicas: utilpointer.Int32(3), + Version: "v1.19.1", + RemediationStrategy: &controlplanev1.RemediationStrategy{ + MaxRetry: utilpointer.Int32(3), + RetryPeriod: metav1.Duration{Duration: controlplanev1.DefaultMinHealthyPeriod}, // RetryPeriod not yet expired. 
+ }, + }, + }, + Cluster: &clusterv1.Cluster{}, + Machines: collections.FromMachines(m1, m2, m3), + } + + r := &KubeadmControlPlaneReconciler{ + Client: env.GetClient(), + recorder: record.NewFakeRecorder(32), + managementCluster: &fakeManagementCluster{ + Workload: fakeWorkloadCluster{ + EtcdMembersResult: nodes(controlPlane.Machines), + }, + }, + } + + ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane) + + g.Expect(ret.IsZero()).To(BeTrue()) // Remediation skipped + g.Expect(err).ToNot(HaveOccurred()) + + g.Expect(controlPlane.KCP.Annotations).ToNot(HaveKey(controlplanev1.RemediationInProgressAnnotation)) + + assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate this machine because the operation already failed in the latest 1h0m0s (RetryPeriod)") + + err = env.Get(ctx, client.ObjectKey{Namespace: m1.Namespace, Name: m1.Name}, m1) + g.Expect(err).ToNot(HaveOccurred()) + g.Expect(m1.ObjectMeta.DeletionTimestamp.IsZero()).To(BeTrue()) + + removeFinalizer(g, m1) + g.Expect(env.Cleanup(ctx, m1, m2, m3)).To(Succeed()) + }) + + // There are no preflight checks for when control plane is not yet initialized + // (it is the first CP, we can nuke it). + + // Preflight checks for when control plane is already initialized. 
+ t.Run("Remediation does not happen if desired replicas <= 1", func(t *testing.T) { g := NewWithT(t) m := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed()) controlPlane := &internal.ControlPlane{ - KCP: &controlplanev1.KubeadmControlPlane{Spec: controlplanev1.KubeadmControlPlaneSpec{ - Replicas: utilpointer.Int32(1), - RolloutStrategy: &controlplanev1.RolloutStrategy{ - RollingUpdate: &controlplanev1.RollingUpdate{ - MaxSurge: &intstr.IntOrString{ - IntVal: 1, + KCP: &controlplanev1.KubeadmControlPlane{ + Spec: controlplanev1.KubeadmControlPlaneSpec{ + Replicas: utilpointer.Int32(1), + RolloutStrategy: &controlplanev1.RolloutStrategy{ + RollingUpdate: &controlplanev1.RollingUpdate{ + MaxSurge: &intstr.IntOrString{ + IntVal: 1, + }, }, }, }, - }}, + Status: controlplanev1.KubeadmControlPlaneStatus{ + Initialized: true, + }, + }, Cluster: &clusterv1.Cluster{}, Machines: collections.FromMachines(m), } - ret, err := r.reconcileUnhealthyMachines(context.TODO(), controlPlane) + ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane) g.Expect(ret.IsZero()).To(BeTrue()) // Remediation skipped g.Expect(err).ToNot(HaveOccurred()) + + g.Expect(controlPlane.KCP.Annotations).ToNot(HaveKey(controlplanev1.RemediationInProgressAnnotation)) + assertMachineCondition(ctx, g, m, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate if current replicas are less or equal to 1") g.Expect(env.Cleanup(ctx, m)).To(Succeed()) }) - t.Run("Remediation does not happen if number of machines lower than desired", func(t *testing.T) { + t.Run("Remediation does not happen if there is another machine being deleted (not the one to be remediated)", func(t *testing.T) { g := NewWithT(t) m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed()) m2 := createMachine(ctx, g, ns.Name, "m2-healthy-") + m3 := getDeletingMachine(ns.Name, 
"m3-deleting") // NB. This machine is not created, it gets only added to control plane controlPlane := &internal.ControlPlane{ - KCP: &controlplanev1.KubeadmControlPlane{Spec: controlplanev1.KubeadmControlPlaneSpec{ - Replicas: utilpointer.Int32(3), - RolloutStrategy: &controlplanev1.RolloutStrategy{}, - }}, + KCP: &controlplanev1.KubeadmControlPlane{ + Spec: controlplanev1.KubeadmControlPlaneSpec{ + Replicas: utilpointer.Int32(3), + }, + Status: controlplanev1.KubeadmControlPlaneStatus{ + Initialized: true, + }, + }, + Cluster: &clusterv1.Cluster{}, + Machines: collections.FromMachines(m1, m2, m3), + } + ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane) + + g.Expect(ret.IsZero()).To(BeTrue()) // Remediation skipped + g.Expect(err).ToNot(HaveOccurred()) + + g.Expect(controlPlane.KCP.Annotations).ToNot(HaveKey(controlplanev1.RemediationInProgressAnnotation)) + + assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP waiting for control plane machine deletion to complete before triggering remediation") + + g.Expect(env.Cleanup(ctx, m1, m2)).To(Succeed()) + }) + t.Run("Remediation does not happen if there is at least one additional unhealthy etcd member on a 3 machine CP", func(t *testing.T) { + g := NewWithT(t) + + m1 := createMachine(ctx, g, ns.Name, "m1-mhc-unhealthy-", withMachineHealthCheckFailed()) + m2 := createMachine(ctx, g, ns.Name, "m2-etcd-unhealthy-", withUnhealthyEtcdMember()) + m3 := createMachine(ctx, g, ns.Name, "m3-etcd-healthy-", withHealthyEtcdMember()) + + controlPlane := &internal.ControlPlane{ + KCP: &controlplanev1.KubeadmControlPlane{ + Spec: controlplanev1.KubeadmControlPlaneSpec{ + Replicas: utilpointer.Int32(3), + }, + Status: controlplanev1.KubeadmControlPlaneStatus{ + Initialized: true, + }, + }, + Cluster: &clusterv1.Cluster{}, + Machines: collections.FromMachines(m1, m2, m3), + } + + r := 
&KubeadmControlPlaneReconciler{ + Client: env.GetClient(), + recorder: record.NewFakeRecorder(32), + managementCluster: &fakeManagementCluster{ + Workload: fakeWorkloadCluster{ + EtcdMembersResult: nodes(controlPlane.Machines), + }, + }, + } + + ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane) + + g.Expect(ret.IsZero()).To(BeTrue()) // Remediation skipped + g.Expect(err).ToNot(HaveOccurred()) + + g.Expect(controlPlane.KCP.Annotations).ToNot(HaveKey(controlplanev1.RemediationInProgressAnnotation)) + + assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate this machine because this could result in etcd loosing quorum") + + g.Expect(env.Cleanup(ctx, m1, m2, m3)).To(Succeed()) + }) + t.Run("Remediation does not happen if there are at least two additional unhealthy etcd member on a 5 machine CP", func(t *testing.T) { + g := NewWithT(t) + + m1 := createMachine(ctx, g, ns.Name, "m1-mhc-unhealthy-", withMachineHealthCheckFailed()) + m2 := createMachine(ctx, g, ns.Name, "m2-etcd-unhealthy-", withUnhealthyEtcdMember()) + m3 := createMachine(ctx, g, ns.Name, "m3-etcd-unhealthy-", withUnhealthyEtcdMember()) + m4 := createMachine(ctx, g, ns.Name, "m4-etcd-healthy-", withHealthyEtcdMember()) + m5 := createMachine(ctx, g, ns.Name, "m5-etcd-healthy-", withHealthyEtcdMember()) + + controlPlane := &internal.ControlPlane{ + KCP: &controlplanev1.KubeadmControlPlane{ + Spec: controlplanev1.KubeadmControlPlaneSpec{ + Replicas: utilpointer.Int32(5), + }, + Status: controlplanev1.KubeadmControlPlaneStatus{ + Initialized: true, + }, + }, + Cluster: &clusterv1.Cluster{}, + Machines: collections.FromMachines(m1, m2, m3, m4, m5), + } + + r := &KubeadmControlPlaneReconciler{ + Client: env.GetClient(), + recorder: record.NewFakeRecorder(32), + managementCluster: &fakeManagementCluster{ + Workload: fakeWorkloadCluster{ + EtcdMembersResult: 
nodes(controlPlane.Machines), + }, + }, + } + + ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane) + + g.Expect(ret.IsZero()).To(BeTrue()) // Remediation skipped + g.Expect(err).ToNot(HaveOccurred()) + + g.Expect(controlPlane.KCP.Annotations).ToNot(HaveKey(controlplanev1.RemediationInProgressAnnotation)) + + assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate this machine because this could result in etcd loosing quorum") + + g.Expect(env.Cleanup(ctx, m1, m2, m3, m4, m5)).To(Succeed()) + }) + + // Remediation for when control plane is not yet initialized + + t.Run("Remediation deletes unhealthy machine - 1 CP not initialized", func(t *testing.T) { + g := NewWithT(t) + + m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer()) + + controlPlane := &internal.ControlPlane{ + KCP: &controlplanev1.KubeadmControlPlane{ + Spec: controlplanev1.KubeadmControlPlaneSpec{ + Replicas: utilpointer.Int32(1), + Version: "v1.19.1", + }, + Status: controlplanev1.KubeadmControlPlaneStatus{ + Initialized: false, + }, + }, + Cluster: &clusterv1.Cluster{}, + Machines: collections.FromMachines(m1), + } + + r := &KubeadmControlPlaneReconciler{ + Client: env.GetClient(), + recorder: record.NewFakeRecorder(32), + managementCluster: &fakeManagementCluster{ + Workload: fakeWorkloadCluster{ + EtcdMembersResult: nodes(controlPlane.Machines), + }, + }, + } + + ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane) + + g.Expect(ret.IsZero()).To(BeFalse()) // Remediation completed, requeue + g.Expect(err).ToNot(HaveOccurred()) + + g.Expect(controlPlane.KCP.Annotations).To(HaveKey(controlplanev1.RemediationInProgressAnnotation)) + remediationData, err := RemediationDataFromAnnotation(controlPlane.KCP.Annotations[controlplanev1.RemediationInProgressAnnotation]) + 
g.Expect(err).ToNot(HaveOccurred()) + g.Expect(remediationData.Machine).To(Equal(m1.Name)) + g.Expect(remediationData.RetryCount).To(Equal(0)) + + assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "") + + err = env.Get(ctx, client.ObjectKey{Namespace: m1.Namespace, Name: m1.Name}, m1) + g.Expect(err).ToNot(HaveOccurred()) + g.Expect(m1.ObjectMeta.DeletionTimestamp.IsZero()).To(BeFalse()) + + removeFinalizer(g, m1) + g.Expect(env.Cleanup(ctx, m1)).To(Succeed()) + }) + t.Run("Subsequent remediation of the same machine increase retry count - 1 CP not initialized", func(t *testing.T) { + g := NewWithT(t) + + m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer()) + + controlPlane := &internal.ControlPlane{ + KCP: &controlplanev1.KubeadmControlPlane{ + Spec: controlplanev1.KubeadmControlPlaneSpec{ + Replicas: utilpointer.Int32(1), + Version: "v1.19.1", + }, + Status: controlplanev1.KubeadmControlPlaneStatus{ + Initialized: false, + }, + }, + Cluster: &clusterv1.Cluster{}, + Machines: collections.FromMachines(m1), + } + + // First reconcile, remediate machine m1 for the first time + r := &KubeadmControlPlaneReconciler{ + Client: env.GetClient(), + recorder: record.NewFakeRecorder(32), + managementCluster: &fakeManagementCluster{ + Workload: fakeWorkloadCluster{ + EtcdMembersResult: nodes(controlPlane.Machines), + }, + }, + } + + ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane) + + g.Expect(ret.IsZero()).To(BeFalse()) // Remediation completed, requeue + g.Expect(err).ToNot(HaveOccurred()) + + g.Expect(controlPlane.KCP.Annotations).To(HaveKey(controlplanev1.RemediationInProgressAnnotation)) + remediationData, err := RemediationDataFromAnnotation(controlPlane.KCP.Annotations[controlplanev1.RemediationInProgressAnnotation]) + g.Expect(err).ToNot(HaveOccurred()) + 
g.Expect(remediationData.Machine).To(Equal(m1.Name)) + g.Expect(remediationData.RetryCount).To(Equal(0)) + + assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "") + + err = env.Get(ctx, client.ObjectKey{Namespace: m1.Namespace, Name: m1.Name}, m1) + g.Expect(err).ToNot(HaveOccurred()) + g.Expect(m1.ObjectMeta.DeletionTimestamp.IsZero()).To(BeFalse()) + + removeFinalizer(g, m1) + g.Expect(env.CleanupAndWait(ctx, m1)).To(Succeed()) + + for i := 2; i < 4; i++ { + // Simulate the creation of a replacement for 0. + mi := createMachine(ctx, g, ns.Name, fmt.Sprintf("m%d-unhealthy-", i), withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer(), withRemediateForAnnotation(MustMarshalRemediationData(remediationData))) + + // Simulate KCP dropping RemediationInProgressAnnotation after creating the replacement machine. + delete(controlPlane.KCP.Annotations, controlplanev1.RemediationInProgressAnnotation) + + controlPlane.Machines = collections.FromMachines(mi) + + // Reconcile unhealthy replacements for m1. 
+ r.managementCluster = &fakeManagementCluster{ + Workload: fakeWorkloadCluster{ + EtcdMembersResult: nodes(collections.FromMachines(mi)), + }, + } + ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane) + + g.Expect(ret.IsZero()).To(BeFalse()) // Remediation completed, requeue + g.Expect(err).ToNot(HaveOccurred()) + + g.Expect(controlPlane.KCP.Annotations).To(HaveKey(controlplanev1.RemediationInProgressAnnotation)) + remediationData, err = RemediationDataFromAnnotation(controlPlane.KCP.Annotations[controlplanev1.RemediationInProgressAnnotation]) + g.Expect(err).ToNot(HaveOccurred()) + g.Expect(remediationData.Machine).To(Equal(mi.Name)) + g.Expect(remediationData.RetryCount).To(Equal(i - 1)) + + assertMachineCondition(ctx, g, mi, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "") + + err = env.Get(ctx, client.ObjectKey{Namespace: mi.Namespace, Name: mi.Name}, mi) + g.Expect(err).ToNot(HaveOccurred()) + g.Expect(mi.ObjectMeta.DeletionTimestamp.IsZero()).To(BeFalse()) + + removeFinalizer(g, mi) + g.Expect(env.CleanupAndWait(ctx, mi)).To(Succeed()) + } + }) + + // Remediation for when control plane is already initialized + + t.Run("Remediation deletes unhealthy machine - 2 CP (during 1 CP rolling upgrade)", func(t *testing.T) { + g := NewWithT(t) + + m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer()) + m2 := createMachine(ctx, g, ns.Name, "m2-healthy-", withHealthyEtcdMember()) + + controlPlane := &internal.ControlPlane{ + KCP: &controlplanev1.KubeadmControlPlane{ + Spec: controlplanev1.KubeadmControlPlaneSpec{ + Replicas: utilpointer.Int32(2), + Version: "v1.19.1", + }, + Status: controlplanev1.KubeadmControlPlaneStatus{ + Initialized: true, + }, + }, Cluster: &clusterv1.Cluster{}, Machines: collections.FromMachines(m1, m2), } - ret, err := r.reconcileUnhealthyMachines(context.TODO(), controlPlane) - 
g.Expect(ret.IsZero()).To(BeTrue()) // Remediation skipped + r := &KubeadmControlPlaneReconciler{ + Client: env.GetClient(), + recorder: record.NewFakeRecorder(32), + managementCluster: &fakeManagementCluster{ + Workload: fakeWorkloadCluster{ + EtcdMembersResult: nodes(controlPlane.Machines), + }, + }, + } + + ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane) + + g.Expect(ret.IsZero()).To(BeFalse()) // Remediation completed, requeue + g.Expect(err).ToNot(HaveOccurred()) + + g.Expect(controlPlane.KCP.Annotations).To(HaveKey(controlplanev1.RemediationInProgressAnnotation)) + remediationData, err := RemediationDataFromAnnotation(controlPlane.KCP.Annotations[controlplanev1.RemediationInProgressAnnotation]) g.Expect(err).ToNot(HaveOccurred()) - assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP waiting for having at least 3 control plane machines before triggering remediation") + g.Expect(remediationData.Machine).To(Equal(m1.Name)) + g.Expect(remediationData.RetryCount).To(Equal(0)) + assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "") + + err = env.Get(ctx, client.ObjectKey{Namespace: m1.Namespace, Name: m1.Name}, m1) + g.Expect(err).ToNot(HaveOccurred()) + g.Expect(m1.ObjectMeta.DeletionTimestamp.IsZero()).To(BeFalse()) + + removeFinalizer(g, m1) g.Expect(env.Cleanup(ctx, m1, m2)).To(Succeed()) }) - t.Run("Remediation does not happen if there is a deleting machine", func(t *testing.T) { + t.Run("Remediation deletes unhealthy machine - 3 CP", func(t *testing.T) { g := NewWithT(t) - m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed()) - m2 := createMachine(ctx, g, ns.Name, "m2-healthy-") - m3 := getDeletingMachine(ns.Name, "m3-deleting") // NB. 
This machine is not created, it gets only added to control plane + m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer()) + m2 := createMachine(ctx, g, ns.Name, "m2-healthy-", withHealthyEtcdMember()) + m3 := createMachine(ctx, g, ns.Name, "m3-healthy-", withHealthyEtcdMember()) + controlPlane := &internal.ControlPlane{ - KCP: &controlplanev1.KubeadmControlPlane{Spec: controlplanev1.KubeadmControlPlaneSpec{ - Replicas: utilpointer.Int32(3), - }}, + KCP: &controlplanev1.KubeadmControlPlane{ + Spec: controlplanev1.KubeadmControlPlaneSpec{ + Replicas: utilpointer.Int32(3), + Version: "v1.19.1", + }, + Status: controlplanev1.KubeadmControlPlaneStatus{ + Initialized: true, + }, + }, Cluster: &clusterv1.Cluster{}, Machines: collections.FromMachines(m1, m2, m3), } - ret, err := r.reconcileUnhealthyMachines(context.TODO(), controlPlane) - g.Expect(ret.IsZero()).To(BeTrue()) // Remediation skipped + r := &KubeadmControlPlaneReconciler{ + Client: env.GetClient(), + recorder: record.NewFakeRecorder(32), + managementCluster: &fakeManagementCluster{ + Workload: fakeWorkloadCluster{ + EtcdMembersResult: nodes(controlPlane.Machines), + }, + }, + } + + ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane) + + g.Expect(ret.IsZero()).To(BeFalse()) // Remediation completed, requeue g.Expect(err).ToNot(HaveOccurred()) - assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP waiting for control plane machine deletion to complete before triggering remediation") - g.Expect(env.Cleanup(ctx, m1, m2)).To(Succeed()) + g.Expect(controlPlane.KCP.Annotations).To(HaveKey(controlplanev1.RemediationInProgressAnnotation)) + remediationData, err := RemediationDataFromAnnotation(controlPlane.KCP.Annotations[controlplanev1.RemediationInProgressAnnotation]) + g.Expect(err).ToNot(HaveOccurred()) + 
g.Expect(remediationData.Machine).To(Equal(m1.Name)) + g.Expect(remediationData.RetryCount).To(Equal(0)) + + assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "") + + err = env.Get(ctx, client.ObjectKey{Namespace: m1.Namespace, Name: m1.Name}, m1) + g.Expect(err).ToNot(HaveOccurred()) + g.Expect(m1.ObjectMeta.DeletionTimestamp.IsZero()).To(BeFalse()) + + removeFinalizer(g, m1) + g.Expect(env.Cleanup(ctx, m1, m2, m3)).To(Succeed()) }) - t.Run("Remediation does not happen if there is at least one additional unhealthy etcd member on a 3 machine CP", func(t *testing.T) { + t.Run("Remediation deletes unhealthy machine - 4 CP (during 3 CP rolling upgrade)", func(t *testing.T) { g := NewWithT(t) - m1 := createMachine(ctx, g, ns.Name, "m1-mhc-unhealthy-", withMachineHealthCheckFailed()) - m2 := createMachine(ctx, g, ns.Name, "m2-etcd-unhealthy-", withUnhealthyEtcdMember()) - m3 := createMachine(ctx, g, ns.Name, "m3-etcd-healthy-", withHealthyEtcdMember()) + m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer()) + m2 := createMachine(ctx, g, ns.Name, "m2-healthy-", withHealthyEtcdMember()) + m3 := createMachine(ctx, g, ns.Name, "m3-healthy-", withHealthyEtcdMember()) + m4 := createMachine(ctx, g, ns.Name, "m4-healthy-", withHealthyEtcdMember()) controlPlane := &internal.ControlPlane{ - KCP: &controlplanev1.KubeadmControlPlane{Spec: controlplanev1.KubeadmControlPlaneSpec{ - Replicas: utilpointer.Int32(3), - }}, + KCP: &controlplanev1.KubeadmControlPlane{ + Spec: controlplanev1.KubeadmControlPlaneSpec{ + Replicas: utilpointer.Int32(4), + Version: "v1.19.1", + }, + Status: controlplanev1.KubeadmControlPlaneStatus{ + Initialized: true, + }, + }, Cluster: &clusterv1.Cluster{}, - Machines: collections.FromMachines(m1, m2, m3), + Machines: collections.FromMachines(m1, m2, m3, m4), } r := 
&KubeadmControlPlaneReconciler{ @@ -201,29 +790,46 @@ func TestReconcileUnhealthyMachines(t *testing.T) { }, } - ret, err := r.reconcileUnhealthyMachines(context.TODO(), controlPlane) + ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane) - g.Expect(ret.IsZero()).To(BeTrue()) // Remediation skipped + g.Expect(ret.IsZero()).To(BeFalse()) // Remediation completed, requeue g.Expect(err).ToNot(HaveOccurred()) - assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate this machine because this could result in etcd loosing quorum") - g.Expect(env.Cleanup(ctx, m1, m2, m3)).To(Succeed()) + g.Expect(controlPlane.KCP.Annotations).To(HaveKey(controlplanev1.RemediationInProgressAnnotation)) + remediationData, err := RemediationDataFromAnnotation(controlPlane.KCP.Annotations[controlplanev1.RemediationInProgressAnnotation]) + g.Expect(err).ToNot(HaveOccurred()) + g.Expect(remediationData.Machine).To(Equal(m1.Name)) + g.Expect(remediationData.RetryCount).To(Equal(0)) + + assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "") + + err = env.Get(ctx, client.ObjectKey{Namespace: m1.Namespace, Name: m1.Name}, m1) + g.Expect(err).ToNot(HaveOccurred()) + g.Expect(m1.ObjectMeta.DeletionTimestamp.IsZero()).To(BeFalse()) + + removeFinalizer(g, m1) + g.Expect(env.Cleanup(ctx, m1, m2, m3, m4)).To(Succeed()) }) - t.Run("Remediation does not happen if there is at least two additional unhealthy etcd member on a 5 machine CP", func(t *testing.T) { + t.Run("Remediation fails gracefully if no healthy Control Planes are available to become etcd leader", func(t *testing.T) { g := NewWithT(t) - m1 := createMachine(ctx, g, ns.Name, "m1-mhc-unhealthy-", withMachineHealthCheckFailed()) - m2 := createMachine(ctx, g, ns.Name, 
"m2-etcd-unhealthy-", withUnhealthyEtcdMember()) - m3 := createMachine(ctx, g, ns.Name, "m3-etcd-unhealthy-", withUnhealthyEtcdMember()) - m4 := createMachine(ctx, g, ns.Name, "m4-etcd-healthy-", withHealthyEtcdMember()) - m5 := createMachine(ctx, g, ns.Name, "m5-etcd-healthy-", withHealthyEtcdMember()) + m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer()) + m2 := createMachine(ctx, g, ns.Name, "m2-healthy-", withMachineHealthCheckFailed(), withHealthyEtcdMember()) + m3 := createMachine(ctx, g, ns.Name, "m3-healthy-", withMachineHealthCheckFailed(), withHealthyEtcdMember()) + m4 := createMachine(ctx, g, ns.Name, "m4-healthy-", withMachineHealthCheckFailed(), withHealthyEtcdMember()) controlPlane := &internal.ControlPlane{ - KCP: &controlplanev1.KubeadmControlPlane{Spec: controlplanev1.KubeadmControlPlaneSpec{ - Replicas: utilpointer.Int32(5), - }}, + KCP: &controlplanev1.KubeadmControlPlane{ + Spec: controlplanev1.KubeadmControlPlaneSpec{ + Replicas: utilpointer.Int32(4), + Version: "v1.19.1", + }, + Status: controlplanev1.KubeadmControlPlaneStatus{ + Initialized: true, + }, + }, Cluster: &clusterv1.Cluster{}, - Machines: collections.FromMachines(m1, m2, m3, m4, m5), + Machines: collections.FromMachines(m1, m2, m3, m4), } r := &KubeadmControlPlaneReconciler{ @@ -235,35 +841,39 @@ func TestReconcileUnhealthyMachines(t *testing.T) { }, }, } + _, err = r.reconcileUnhealthyMachines(ctx, controlPlane) + g.Expect(err).ToNot(HaveOccurred()) - ret, err := r.reconcileUnhealthyMachines(context.TODO(), controlPlane) + g.Expect(controlPlane.KCP.Annotations).ToNot(HaveKey(controlplanev1.RemediationInProgressAnnotation)) - g.Expect(ret.IsZero()).To(BeTrue()) // Remediation skipped - g.Expect(err).ToNot(HaveOccurred()) - assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate 
this machine because this could result in etcd loosing quorum") + assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.RemediationFailedReason, clusterv1.ConditionSeverityWarning, + "A control plane machine needs remediation, but there is no healthy machine to forward etcd leadership to. Skipping remediation") - g.Expect(env.Cleanup(ctx, m1, m2, m3, m4, m5)).To(Succeed()) + removeFinalizer(g, m1) + g.Expect(env.Cleanup(ctx, m1, m2, m3, m4)).To(Succeed()) }) - t.Run("Remediation deletes unhealthy machine - 2 CP (during 1 CP rolling upgrade)", func(t *testing.T) { + t.Run("Subsequent remediation of the same machine increase retry count - 3 CP", func(t *testing.T) { g := NewWithT(t) - m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed()) - patchHelper, err := patch.NewHelper(m1, env.GetClient()) - g.Expect(err).ToNot(HaveOccurred()) - m1.ObjectMeta.Finalizers = []string{"wait-before-delete"} - g.Expect(patchHelper.Patch(ctx, m1)) - + m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer()) m2 := createMachine(ctx, g, ns.Name, "m2-healthy-", withHealthyEtcdMember()) + m3 := createMachine(ctx, g, ns.Name, "m3-healthy-", withHealthyEtcdMember()) controlPlane := &internal.ControlPlane{ - KCP: &controlplanev1.KubeadmControlPlane{Spec: controlplanev1.KubeadmControlPlaneSpec{ - Replicas: utilpointer.Int32(2), - Version: "v1.19.1", - }}, + KCP: &controlplanev1.KubeadmControlPlane{ + Spec: controlplanev1.KubeadmControlPlaneSpec{ + Replicas: utilpointer.Int32(1), + Version: "v1.19.1", + }, + Status: controlplanev1.KubeadmControlPlaneStatus{ + Initialized: false, + }, + }, Cluster: &clusterv1.Cluster{}, - Machines: collections.FromMachines(m1, m2), + Machines: collections.FromMachines(m1, m2, m3), } + // First reconcile, remediate machine m1 for the first time r := &KubeadmControlPlaneReconciler{ Client: env.GetClient(), 
recorder: record.NewFakeRecorder(32), @@ -274,43 +884,99 @@ func TestReconcileUnhealthyMachines(t *testing.T) { }, } - ret, err := r.reconcileUnhealthyMachines(context.TODO(), controlPlane) + ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane) g.Expect(ret.IsZero()).To(BeFalse()) // Remediation completed, requeue g.Expect(err).ToNot(HaveOccurred()) + g.Expect(controlPlane.KCP.Annotations).To(HaveKey(controlplanev1.RemediationInProgressAnnotation)) + remediationData, err := RemediationDataFromAnnotation(controlPlane.KCP.Annotations[controlplanev1.RemediationInProgressAnnotation]) + g.Expect(err).ToNot(HaveOccurred()) + g.Expect(remediationData.Machine).To(Equal(m1.Name)) + g.Expect(remediationData.RetryCount).To(Equal(0)) + assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "") err = env.Get(ctx, client.ObjectKey{Namespace: m1.Namespace, Name: m1.Name}, m1) g.Expect(err).ToNot(HaveOccurred()) g.Expect(m1.ObjectMeta.DeletionTimestamp.IsZero()).To(BeFalse()) - patchHelper, err = patch.NewHelper(m1, env.GetClient()) - g.Expect(err).ToNot(HaveOccurred()) - m1.ObjectMeta.Finalizers = nil - g.Expect(patchHelper.Patch(ctx, m1)) + removeFinalizer(g, m1) + g.Expect(env.CleanupAndWait(ctx, m1)).To(Succeed()) - g.Expect(env.Cleanup(ctx, m1, m2)).To(Succeed()) + for i := 5; i < 6; i++ { + // Simulate the creation of a replacement for m1. + mi := createMachine(ctx, g, ns.Name, fmt.Sprintf("m%d-unhealthy-", i), withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer(), withRemediateForAnnotation(MustMarshalRemediationData(remediationData))) + + // Simulate KCP dropping RemediationInProgressAnnotation after creating the replacement machine. + delete(controlPlane.KCP.Annotations, controlplanev1.RemediationInProgressAnnotation) + controlPlane.Machines = collections.FromMachines(mi, m2, m3) + + // Reconcile unhealthy replacements for m1. 
+ r.managementCluster = &fakeManagementCluster{ + Workload: fakeWorkloadCluster{ + EtcdMembersResult: nodes(collections.FromMachines(mi, m2, m3)), + }, + } + + ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane) + + g.Expect(ret.IsZero()).To(BeFalse()) // Remediation completed, requeue + g.Expect(err).ToNot(HaveOccurred()) + + g.Expect(controlPlane.KCP.Annotations).To(HaveKey(controlplanev1.RemediationInProgressAnnotation)) + remediationData, err := RemediationDataFromAnnotation(controlPlane.KCP.Annotations[controlplanev1.RemediationInProgressAnnotation]) + g.Expect(err).ToNot(HaveOccurred()) + g.Expect(remediationData.Machine).To(Equal(mi.Name)) + g.Expect(remediationData.RetryCount).To(Equal(i - 4)) + + assertMachineCondition(ctx, g, mi, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "") + + err = env.Get(ctx, client.ObjectKey{Namespace: mi.Namespace, Name: mi.Name}, mi) + g.Expect(err).ToNot(HaveOccurred()) + g.Expect(mi.ObjectMeta.DeletionTimestamp.IsZero()).To(BeFalse()) + + removeFinalizer(g, mi) + g.Expect(env.CleanupAndWait(ctx, mi)).To(Succeed()) + } + + g.Expect(env.CleanupAndWait(ctx, m2, m3)).To(Succeed()) }) - t.Run("Remediation deletes unhealthy machine - 3 CP", func(t *testing.T) { +} + +func TestReconcileUnhealthyMachinesSequences(t *testing.T) { + var removeFinalizer = func(g *WithT, m *clusterv1.Machine) { + patchHelper, err := patch.NewHelper(m, env.GetClient()) + g.Expect(err).ToNot(HaveOccurred()) + m.ObjectMeta.Finalizers = nil + g.Expect(patchHelper.Patch(ctx, m)) + } + + t.Run("Remediates the first CP machine having problems to come up", func(t *testing.T) { g := NewWithT(t) - m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed()) - patchHelper, err := patch.NewHelper(m1, env.GetClient()) + ns, err := env.CreateNamespace(ctx, "ns1") g.Expect(err).ToNot(HaveOccurred()) - m1.ObjectMeta.Finalizers = 
[]string{"wait-before-delete"} - g.Expect(patchHelper.Patch(ctx, m1)) + defer func() { + g.Expect(env.Cleanup(ctx, ns)).To(Succeed()) + }() - m2 := createMachine(ctx, g, ns.Name, "m2-healthy-", withHealthyEtcdMember()) - m3 := createMachine(ctx, g, ns.Name, "m3-healthy-", withHealthyEtcdMember()) + // Control plane not initialized yet, First CP is unhealthy and gets remediated: + + m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer()) controlPlane := &internal.ControlPlane{ - KCP: &controlplanev1.KubeadmControlPlane{Spec: controlplanev1.KubeadmControlPlaneSpec{ - Replicas: utilpointer.Int32(3), - Version: "v1.19.1", - }}, + KCP: &controlplanev1.KubeadmControlPlane{ + Spec: controlplanev1.KubeadmControlPlaneSpec{ + Replicas: utilpointer.Int32(3), + Version: "v1.19.1", + }, + Status: controlplanev1.KubeadmControlPlaneStatus{ + Initialized: false, + }, + }, Cluster: &clusterv1.Cluster{}, - Machines: collections.FromMachines(m1, m2, m3), + Machines: collections.FromMachines(m1), } r := &KubeadmControlPlaneReconciler{ @@ -323,44 +989,103 @@ func TestReconcileUnhealthyMachines(t *testing.T) { }, } - ret, err := r.reconcileUnhealthyMachines(context.TODO(), controlPlane) + ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane) g.Expect(ret.IsZero()).To(BeFalse()) // Remediation completed, requeue g.Expect(err).ToNot(HaveOccurred()) + g.Expect(controlPlane.KCP.Annotations).To(HaveKey(controlplanev1.RemediationInProgressAnnotation)) + remediationData, err := RemediationDataFromAnnotation(controlPlane.KCP.Annotations[controlplanev1.RemediationInProgressAnnotation]) + g.Expect(err).ToNot(HaveOccurred()) + g.Expect(remediationData.Machine).To(Equal(m1.Name)) + g.Expect(remediationData.RetryCount).To(Equal(0)) + assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "") err = env.Get(ctx, 
client.ObjectKey{Namespace: m1.Namespace, Name: m1.Name}, m1) g.Expect(err).ToNot(HaveOccurred()) g.Expect(m1.ObjectMeta.DeletionTimestamp.IsZero()).To(BeFalse()) - patchHelper, err = patch.NewHelper(m1, env.GetClient()) + removeFinalizer(g, m1) + g.Expect(env.Cleanup(ctx, m1)).To(Succeed()) + + // Fake scaling up, which creates a remediation machine, fast forwards to when also the replacement machine is marked unhealthy. + // NOTE: scale up also resets remediation in progress and remediation counts. + + m2 := createMachine(ctx, g, ns.Name, "m2-unhealthy-", withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer(), withRemediateForAnnotation(MustMarshalRemediationData(remediationData))) + delete(controlPlane.KCP.Annotations, controlplanev1.RemediationInProgressAnnotation) + + // Control plane not initialized yet, Second CP is unhealthy and gets remediated (retry 2) + + controlPlane.Machines = collections.FromMachines(m2) + r.managementCluster = &fakeManagementCluster{ + Workload: fakeWorkloadCluster{ + EtcdMembersResult: nodes(controlPlane.Machines), + }, + } + + ret, err = r.reconcileUnhealthyMachines(ctx, controlPlane) + + g.Expect(ret.IsZero()).To(BeFalse()) // Remediation completed, requeue g.Expect(err).ToNot(HaveOccurred()) - m1.ObjectMeta.Finalizers = nil - g.Expect(patchHelper.Patch(ctx, m1)) - g.Expect(env.Cleanup(ctx, m1, m2, m3)).To(Succeed()) + g.Expect(controlPlane.KCP.Annotations).To(HaveKey(controlplanev1.RemediationInProgressAnnotation)) + remediationData, err = RemediationDataFromAnnotation(controlPlane.KCP.Annotations[controlplanev1.RemediationInProgressAnnotation]) + g.Expect(err).ToNot(HaveOccurred()) + g.Expect(remediationData.Machine).To(Equal(m2.Name)) + g.Expect(remediationData.RetryCount).To(Equal(1)) + + assertMachineCondition(ctx, g, m2, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "") + + err = env.Get(ctx, client.ObjectKey{Namespace: 
m2.Namespace, Name: m2.Name}, m2) + g.Expect(err).ToNot(HaveOccurred()) + g.Expect(m2.ObjectMeta.DeletionTimestamp.IsZero()).To(BeFalse()) + + removeFinalizer(g, m2) + g.Expect(env.Cleanup(ctx, m2)).To(Succeed()) + + // Fake scaling up, which creates a remediation machine, which is healthy. + // NOTE: scale up also resets remediation in progress and remediation counts. + + m3 := createMachine(ctx, g, ns.Name, "m3-healthy-", withHealthyEtcdMember(), withRemediateForAnnotation(MustMarshalRemediationData(remediationData))) + delete(controlPlane.KCP.Annotations, controlplanev1.RemediationInProgressAnnotation) + + g.Expect(env.Cleanup(ctx, m3)).To(Succeed()) }) - t.Run("Remediation deletes unhealthy machine - 4 CP (during 3 CP rolling upgrade)", func(t *testing.T) { + + t.Run("Remediates the second CP machine having problems to come up", func(t *testing.T) { g := NewWithT(t) - m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed()) - patchHelper, err := patch.NewHelper(m1, env.GetClient()) + ns, err := env.CreateNamespace(ctx, "ns1") g.Expect(err).ToNot(HaveOccurred()) - m1.ObjectMeta.Finalizers = []string{"wait-before-delete"} - g.Expect(patchHelper.Patch(ctx, m1)) + defer func() { + g.Expect(env.Cleanup(ctx, ns)).To(Succeed()) + }() - m2 := createMachine(ctx, g, ns.Name, "m2-healthy-", withHealthyEtcdMember()) - m3 := createMachine(ctx, g, ns.Name, "m3-healthy-", withHealthyEtcdMember()) - m4 := createMachine(ctx, g, ns.Name, "m4-healthy-", withHealthyEtcdMember()) + // Control plane initialized, First CP healthy, second CP is unhealthy and gets remediated: + + m1 := createMachine(ctx, g, ns.Name, "m1-healthy-", withHealthyEtcdMember()) + m2 := createMachine(ctx, g, ns.Name, "m2-unhealthy-", withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer()) controlPlane := &internal.ControlPlane{ - KCP: &controlplanev1.KubeadmControlPlane{Spec: controlplanev1.KubeadmControlPlaneSpec{ - Replicas: utilpointer.Int32(4), - Version: 
"v1.19.1", - }}, + KCP: &controlplanev1.KubeadmControlPlane{ + Spec: controlplanev1.KubeadmControlPlaneSpec{ + Replicas: utilpointer.Int32(3), + Version: "v1.19.1", + RolloutStrategy: &controlplanev1.RolloutStrategy{ + RollingUpdate: &controlplanev1.RollingUpdate{ + MaxSurge: &intstr.IntOrString{ + IntVal: 1, + }, + }, + }, + }, + Status: controlplanev1.KubeadmControlPlaneStatus{ + Initialized: true, + }, + }, Cluster: &clusterv1.Cluster{}, - Machines: collections.FromMachines(m1, m2, m3, m4), + Machines: collections.FromMachines(m1, m2), } r := &KubeadmControlPlaneReconciler{ @@ -373,44 +1098,104 @@ func TestReconcileUnhealthyMachines(t *testing.T) { }, } - ret, err := r.reconcileUnhealthyMachines(context.TODO(), controlPlane) + ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane) g.Expect(ret.IsZero()).To(BeFalse()) // Remediation completed, requeue g.Expect(err).ToNot(HaveOccurred()) - assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "") + g.Expect(controlPlane.KCP.Annotations).To(HaveKey(controlplanev1.RemediationInProgressAnnotation)) + remediationData, err := RemediationDataFromAnnotation(controlPlane.KCP.Annotations[controlplanev1.RemediationInProgressAnnotation]) + g.Expect(err).ToNot(HaveOccurred()) + g.Expect(remediationData.Machine).To(Equal(m2.Name)) + g.Expect(remediationData.RetryCount).To(Equal(0)) - err = env.Get(ctx, client.ObjectKey{Namespace: m1.Namespace, Name: m1.Name}, m1) + assertMachineCondition(ctx, g, m2, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "") + + err = env.Get(ctx, client.ObjectKey{Namespace: m2.Namespace, Name: m2.Name}, m2) g.Expect(err).ToNot(HaveOccurred()) - g.Expect(m1.ObjectMeta.DeletionTimestamp.IsZero()).To(BeFalse()) + g.Expect(m2.ObjectMeta.DeletionTimestamp.IsZero()).To(BeFalse()) + + 
removeFinalizer(g, m2) + g.Expect(env.Cleanup(ctx, m2)).To(Succeed()) + + // Fake scaling up, which creates a remediation machine, fast forwards to when also the replacement machine is marked unhealthy. + // NOTE: scale up also resets remediation in progress and remediation counts. + + m3 := createMachine(ctx, g, ns.Name, "m3-unhealthy-", withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer(), withRemediateForAnnotation(MustMarshalRemediationData(remediationData))) + delete(controlPlane.KCP.Annotations, controlplanev1.RemediationInProgressAnnotation) + + // Control plane initialized, replacement CP is unhealthy and gets remediated (retry 1) + + controlPlane.Machines = collections.FromMachines(m1, m3) + r.managementCluster = &fakeManagementCluster{ + Workload: fakeWorkloadCluster{ + EtcdMembersResult: nodes(controlPlane.Machines), + }, + } - patchHelper, err = patch.NewHelper(m1, env.GetClient()) + ret, err = r.reconcileUnhealthyMachines(ctx, controlPlane) + + g.Expect(ret.IsZero()).To(BeFalse()) // Remediation completed, requeue g.Expect(err).ToNot(HaveOccurred()) - m1.ObjectMeta.Finalizers = nil - g.Expect(patchHelper.Patch(ctx, m1)) - g.Expect(env.Cleanup(ctx, m1, m2, m3, m4)).To(Succeed()) + g.Expect(controlPlane.KCP.Annotations).To(HaveKey(controlplanev1.RemediationInProgressAnnotation)) + remediationData, err = RemediationDataFromAnnotation(controlPlane.KCP.Annotations[controlplanev1.RemediationInProgressAnnotation]) + g.Expect(err).ToNot(HaveOccurred()) + g.Expect(remediationData.Machine).To(Equal(m3.Name)) + g.Expect(remediationData.RetryCount).To(Equal(1)) + + assertMachineCondition(ctx, g, m3, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "") + + err = env.Get(ctx, client.ObjectKey{Namespace: m3.Namespace, Name: m3.Name}, m3) + g.Expect(err).ToNot(HaveOccurred()) + g.Expect(m3.ObjectMeta.DeletionTimestamp.IsZero()).To(BeFalse()) + + removeFinalizer(g, 
m3) + g.Expect(env.Cleanup(ctx, m3)).To(Succeed()) + + // Fake scaling up, which creates a remediation machine, which is healthy. + // NOTE: scale up also resets remediation in progress and remediation counts. + + m4 := createMachine(ctx, g, ns.Name, "m4-healthy-", withHealthyEtcdMember(), withRemediateForAnnotation(MustMarshalRemediationData(remediationData))) + delete(controlPlane.KCP.Annotations, controlplanev1.RemediationInProgressAnnotation) + + g.Expect(env.Cleanup(ctx, m1, m4)).To(Succeed()) }) - t.Run("Remediation does not happen if no healthy Control Planes are available to become etcd leader", func(t *testing.T) { + + t.Run("Remediates only one CP machine in case of multiple failures", func(t *testing.T) { g := NewWithT(t) - m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed()) - patchHelper, err := patch.NewHelper(m1, env.GetClient()) + ns, err := env.CreateNamespace(ctx, "ns1") g.Expect(err).ToNot(HaveOccurred()) - m1.ObjectMeta.Finalizers = []string{"wait-before-delete"} - g.Expect(patchHelper.Patch(ctx, m1)) + defer func() { + g.Expect(env.Cleanup(ctx, ns)).To(Succeed()) + }() - m2 := createMachine(ctx, g, ns.Name, "m2-healthy-", withMachineHealthCheckFailed(), withHealthyEtcdMember()) - m3 := createMachine(ctx, g, ns.Name, "m3-healthy-", withMachineHealthCheckFailed(), withHealthyEtcdMember()) - m4 := createMachine(ctx, g, ns.Name, "m4-healthy-", withMachineHealthCheckFailed(), withHealthyEtcdMember()) + // Control plane initialized yet, First CP healthy, second and third CP are unhealthy. 
second gets remediated: + + m1 := createMachine(ctx, g, ns.Name, "m1-healthy-", withHealthyEtcdMember()) + m2 := createMachine(ctx, g, ns.Name, "m2-unhealthy-", withHealthyEtcdMember(), withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer()) + m3 := createMachine(ctx, g, ns.Name, "m3-unhealthy-", withHealthyEtcdMember(), withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer()) controlPlane := &internal.ControlPlane{ - KCP: &controlplanev1.KubeadmControlPlane{Spec: controlplanev1.KubeadmControlPlaneSpec{ - Replicas: utilpointer.Int32(4), - Version: "v1.19.1", - }}, + KCP: &controlplanev1.KubeadmControlPlane{ + Spec: controlplanev1.KubeadmControlPlaneSpec{ + Replicas: utilpointer.Int32(3), + Version: "v1.19.1", + RolloutStrategy: &controlplanev1.RolloutStrategy{ + RollingUpdate: &controlplanev1.RollingUpdate{ + MaxSurge: &intstr.IntOrString{ + IntVal: 1, + }, + }, + }, + }, + Status: controlplanev1.KubeadmControlPlaneStatus{ + Initialized: true, + }, + }, Cluster: &clusterv1.Cluster{}, - Machines: collections.FromMachines(m1, m2, m3, m4), + Machines: collections.FromMachines(m1, m2, m3), } r := &KubeadmControlPlaneReconciler{ @@ -422,24 +1207,48 @@ func TestReconcileUnhealthyMachines(t *testing.T) { }, }, } - _, err = r.reconcileUnhealthyMachines(context.TODO(), controlPlane) + + ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane) + + g.Expect(ret.IsZero()).To(BeFalse()) // Remediation completed, requeue g.Expect(err).ToNot(HaveOccurred()) - assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.RemediationFailedReason, clusterv1.ConditionSeverityWarning, - "A control plane machine needs remediation, but there is no healthy machine to forward etcd leadership to. 
Skipping remediation") + g.Expect(controlPlane.KCP.Annotations).To(HaveKey(controlplanev1.RemediationInProgressAnnotation)) + remediationData, err := RemediationDataFromAnnotation(controlPlane.KCP.Annotations[controlplanev1.RemediationInProgressAnnotation]) + g.Expect(err).ToNot(HaveOccurred()) + g.Expect(remediationData.Machine).To(Equal(m2.Name)) + g.Expect(remediationData.RetryCount).To(Equal(0)) - patchHelper, err = patch.NewHelper(m1, env.GetClient()) + assertMachineCondition(ctx, g, m2, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "") + assertMachineCondition(ctx, g, m3, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "") + + err = env.Get(ctx, client.ObjectKey{Namespace: m2.Namespace, Name: m2.Name}, m2) g.Expect(err).ToNot(HaveOccurred()) - m1.ObjectMeta.Finalizers = nil - g.Expect(patchHelper.Patch(ctx, m1)) + g.Expect(m2.ObjectMeta.DeletionTimestamp.IsZero()).To(BeFalse()) - g.Expect(env.Cleanup(ctx, m1, m2, m3, m4)).To(Succeed()) + removeFinalizer(g, m2) + g.Expect(env.Cleanup(ctx, m2)).To(Succeed()) + + // Check next reconcile does not further remediate + + controlPlane.Machines = collections.FromMachines(m1, m3) + r.managementCluster = &fakeManagementCluster{ + Workload: fakeWorkloadCluster{ + EtcdMembersResult: nodes(controlPlane.Machines), + }, + } + + ret, err = r.reconcileUnhealthyMachines(ctx, controlPlane) + + g.Expect(ret.IsZero()).To(BeTrue()) // Remediation skipped + g.Expect(err).ToNot(HaveOccurred()) + + g.Expect(env.Cleanup(ctx, m1)).To(Succeed()) }) } func TestCanSafelyRemoveEtcdMember(t *testing.T) { g := NewWithT(t) - ctx := context.TODO() ns, err := env.CreateNamespace(ctx, "ns1") g.Expect(err).ToNot(HaveOccurred()) @@ -470,7 +1279,7 @@ func TestCanSafelyRemoveEtcdMember(t *testing.T) { }, } - ret, err := r.canSafelyRemoveEtcdMember(context.TODO(), 
controlPlane, m1) + ret, err := r.canSafelyRemoveEtcdMember(ctx, controlPlane, m1) g.Expect(ret).To(BeFalse()) g.Expect(err).ToNot(HaveOccurred()) @@ -501,7 +1310,7 @@ func TestCanSafelyRemoveEtcdMember(t *testing.T) { }, } - ret, err := r.canSafelyRemoveEtcdMember(context.TODO(), controlPlane, m1) + ret, err := r.canSafelyRemoveEtcdMember(ctx, controlPlane, m1) g.Expect(ret).To(BeTrue()) g.Expect(err).ToNot(HaveOccurred()) @@ -538,7 +1347,7 @@ func TestCanSafelyRemoveEtcdMember(t *testing.T) { }, } - ret, err := r.canSafelyRemoveEtcdMember(context.TODO(), controlPlane, m1) + ret, err := r.canSafelyRemoveEtcdMember(ctx, controlPlane, m1) g.Expect(ret).To(BeTrue()) g.Expect(err).ToNot(HaveOccurred()) @@ -568,7 +1377,7 @@ func TestCanSafelyRemoveEtcdMember(t *testing.T) { }, } - ret, err := r.canSafelyRemoveEtcdMember(context.TODO(), controlPlane, m1) + ret, err := r.canSafelyRemoveEtcdMember(ctx, controlPlane, m1) g.Expect(ret).To(BeFalse()) g.Expect(err).ToNot(HaveOccurred()) @@ -599,7 +1408,7 @@ func TestCanSafelyRemoveEtcdMember(t *testing.T) { }, } - ret, err := r.canSafelyRemoveEtcdMember(context.TODO(), controlPlane, m1) + ret, err := r.canSafelyRemoveEtcdMember(ctx, controlPlane, m1) g.Expect(ret).To(BeTrue()) g.Expect(err).ToNot(HaveOccurred()) @@ -637,7 +1446,7 @@ func TestCanSafelyRemoveEtcdMember(t *testing.T) { }, } - ret, err := r.canSafelyRemoveEtcdMember(context.TODO(), controlPlane, m1) + ret, err := r.canSafelyRemoveEtcdMember(ctx, controlPlane, m1) g.Expect(ret).To(BeTrue()) g.Expect(err).ToNot(HaveOccurred()) @@ -668,7 +1477,7 @@ func TestCanSafelyRemoveEtcdMember(t *testing.T) { }, } - ret, err := r.canSafelyRemoveEtcdMember(context.TODO(), controlPlane, m1) + ret, err := r.canSafelyRemoveEtcdMember(ctx, controlPlane, m1) g.Expect(ret).To(BeFalse()) g.Expect(err).ToNot(HaveOccurred()) @@ -701,7 +1510,7 @@ func TestCanSafelyRemoveEtcdMember(t *testing.T) { }, } - ret, err := r.canSafelyRemoveEtcdMember(context.TODO(), controlPlane, m1) + ret, err 
:= r.canSafelyRemoveEtcdMember(ctx, controlPlane, m1) g.Expect(ret).To(BeTrue()) g.Expect(err).ToNot(HaveOccurred()) @@ -734,7 +1543,7 @@ func TestCanSafelyRemoveEtcdMember(t *testing.T) { }, } - ret, err := r.canSafelyRemoveEtcdMember(context.TODO(), controlPlane, m1) + ret, err := r.canSafelyRemoveEtcdMember(ctx, controlPlane, m1) g.Expect(ret).To(BeFalse()) g.Expect(err).ToNot(HaveOccurred()) @@ -769,7 +1578,7 @@ func TestCanSafelyRemoveEtcdMember(t *testing.T) { }, } - ret, err := r.canSafelyRemoveEtcdMember(context.TODO(), controlPlane, m1) + ret, err := r.canSafelyRemoveEtcdMember(ctx, controlPlane, m1) g.Expect(ret).To(BeTrue()) g.Expect(err).ToNot(HaveOccurred()) @@ -804,7 +1613,7 @@ func TestCanSafelyRemoveEtcdMember(t *testing.T) { }, } - ret, err := r.canSafelyRemoveEtcdMember(context.TODO(), controlPlane, m1) + ret, err := r.canSafelyRemoveEtcdMember(ctx, controlPlane, m1) g.Expect(ret).To(BeFalse()) g.Expect(err).ToNot(HaveOccurred()) @@ -859,6 +1668,21 @@ func withNodeRef(ref string) machineOption { } } +func withRemediateForAnnotation(remediatedFor string) machineOption { + return func(machine *clusterv1.Machine) { + if machine.Annotations == nil { + machine.Annotations = map[string]string{} + } + machine.Annotations[controlplanev1.RemediationForAnnotation] = remediatedFor + } +} + +func withWaitBeforeDeleteFinalizer() machineOption { + return func(machine *clusterv1.Machine) { + machine.Finalizers = []string{"wait-before-delete"} + } +} + func createMachine(ctx context.Context, g *WithT, namespace, name string, options ...machineOption) *clusterv1.Machine { m := &clusterv1.Machine{ ObjectMeta: metav1.ObjectMeta{ @@ -931,3 +1755,11 @@ func assertMachineCondition(ctx context.Context, g *WithT, m *clusterv1.Machine, return nil }, 10*time.Second).Should(Succeed()) } + +func MustMarshalRemediationData(r *RemediationData) string { + s, err := r.Marshal() + if err != nil { + panic("failed to marshal remediation data") + } + return s +} diff --git 
a/controlplane/kubeadm/internal/controllers/status.go b/controlplane/kubeadm/internal/controllers/status.go index c17d62c0cf7c..3d742bdac7f4 100644 --- a/controlplane/kubeadm/internal/controllers/status.go +++ b/controlplane/kubeadm/internal/controllers/status.go @@ -116,5 +116,33 @@ func (r *KubeadmControlPlaneReconciler) updateStatus(ctx context.Context, kcp *c kcp.Status.Ready = true } + // Surface lastRemediation data in status. + // LastRemediation is the remediation currently in progress, if any, or the + // most recent of the remediations we are keeping track of on machines. + var lastRemediation *RemediationData + + if v, ok := kcp.Annotations[controlplanev1.RemediationInProgressAnnotation]; ok { + remediationData, err := RemediationDataFromAnnotation(v) + if err != nil { + return err + } + lastRemediation = remediationData + } else { + for _, m := range ownedMachines.UnsortedList() { + if v, ok := m.Annotations[controlplanev1.RemediationForAnnotation]; ok { + remediationData, err := RemediationDataFromAnnotation(v) + if err != nil { + return err + } + if lastRemediation == nil || lastRemediation.Timestamp.Time.Before(remediationData.Timestamp.Time) { + lastRemediation = remediationData + } + } + } + } + + if lastRemediation != nil { + kcp.Status.LastRemediation = lastRemediation.ToStatus() + } return nil } diff --git a/docs/book/src/developer/providers/v1.3-to-v1.4.md b/docs/book/src/developer/providers/v1.3-to-v1.4.md index 5bd68da0e3fc..5873897d4627 100644 --- a/docs/book/src/developer/providers/v1.3-to-v1.4.md +++ b/docs/book/src/developer/providers/v1.3-to-v1.4.md @@ -67,6 +67,8 @@ maintainers of providers and consumers of our Go API. 
Please note that the following logging flags have been removed: (in `component-base`, but this affects all CAPI controllers): `--add-dir-header`, `--alsologtostderr`, `--log-backtrace-at`, `--log-dir`, `--log-file`, `--log-file-max-size`, `--logtostderr`, `--one-output`, `--skip-headers`, `--skip-log-headers` and `--stderrthreshold`. For more information, please see: https://github.com/kubernetes/enhancements/issues/2845 +- A new `KCPRemediationSpec` test has been added providing better test coverage for KCP remediation most common use cases. As a consequence `MachineRemediationSpec` now only tests remediation of + worker machines (NOTE: we plan to improve this test as well in a future iteration). ### Suggested changes for providers diff --git a/docs/book/src/reference/labels_and_annotations.md b/docs/book/src/reference/labels_and_annotations.md index 78ef2fcbbbfd..a8a2172c6a27 100644 --- a/docs/book/src/reference/labels_and_annotations.md +++ b/docs/book/src/reference/labels_and_annotations.md @@ -49,3 +49,5 @@ | controlplane.cluster.x-k8s.io/skip-coredns | It explicitly skips reconciling CoreDNS if set. | | controlplane.cluster.x-k8s.io/skip-kube-proxy | It explicitly skips reconciling kube-proxy if set. | | controlplane.cluster.x-k8s.io/kubeadm-cluster-configuration | It is a machine annotation that stores the json-marshalled string of KCP ClusterConfiguration. This annotation is used to detect any changes in ClusterConfiguration and trigger machine rollout in KCP. | +| controlplane.cluster.x-k8s.io/remediation-in-progress | It is a KCP annotation that tracks that the system is in between having deleted an unhealthy machine and recreating its replacement. | +| controlplane.cluster.x-k8s.io/remediation-for | It is a machine annotation that links a new machine to the unhealthy machine it is replacing. 
| diff --git a/internal/controllers/machinehealthcheck/machinehealthcheck_controller.go b/internal/controllers/machinehealthcheck/machinehealthcheck_controller.go index f2b98f34b581..2d0557132958 100644 --- a/internal/controllers/machinehealthcheck/machinehealthcheck_controller.go +++ b/internal/controllers/machinehealthcheck/machinehealthcheck_controller.go @@ -199,15 +199,19 @@ func (r *Reconciler) reconcile(ctx context.Context, logger logr.Logger, cluster UID: cluster.UID, }) - // Get the remote cluster cache to use as a client.Reader. - remoteClient, err := r.Tracker.GetClient(ctx, util.ObjectKey(cluster)) - if err != nil { - logger.Error(err, "error creating remote cluster cache") - return ctrl.Result{}, err - } + // If the cluster is already initialized, get the remote cluster cache to use as a client.Reader. + var remoteClient client.Client + if conditions.IsTrue(cluster, clusterv1.ControlPlaneInitializedCondition) { + var err error + remoteClient, err = r.Tracker.GetClient(ctx, util.ObjectKey(cluster)) + if err != nil { + logger.Error(err, "error creating remote cluster cache") + return ctrl.Result{}, err + } - if err := r.watchClusterNodes(ctx, cluster); err != nil { - return ctrl.Result{}, err + if err := r.watchClusterNodes(ctx, cluster); err != nil { + return ctrl.Result{}, err + } } // fetch all targets diff --git a/internal/controllers/machinehealthcheck/machinehealthcheck_targets.go b/internal/controllers/machinehealthcheck/machinehealthcheck_targets.go index 97eb0659bf0c..5d174fb63155 100644 --- a/internal/controllers/machinehealthcheck/machinehealthcheck_targets.go +++ b/internal/controllers/machinehealthcheck/machinehealthcheck_targets.go @@ -31,6 +31,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" + "sigs.k8s.io/cluster-api/util" "sigs.k8s.io/cluster-api/util/annotations" "sigs.k8s.io/cluster-api/util/conditions" "sigs.k8s.io/cluster-api/util/patch" @@ -111,8 +112,9 @@ func (t 
*healthCheckTarget) needsRemediation(logger logr.Logger, timeoutForMachi return true, time.Duration(0) } - // Don't penalize any Machine/Node if the control plane has not been initialized. - if !conditions.IsTrue(t.Cluster, clusterv1.ControlPlaneInitializedCondition) { + // Don't penalize any Machine/Node if the control plane has not been initialized. + // An exception to this rule is the control plane machine itself, so the first control plane machine can be remediated. + if !conditions.IsTrue(t.Cluster, clusterv1.ControlPlaneInitializedCondition) && !util.IsControlPlaneMachine(t.Machine) { logger.V(3).Info("Not evaluating target health because the control plane has not yet been initialized") // Return a nextCheck time of 0 because we'll get requeued when the Cluster is updated. return false, 0 @@ -133,18 +135,18 @@ func (t *healthCheckTarget) needsRemediation(logger logr.Logger, timeoutForMachi return false, 0 } - controlPlaneInitializedTime := conditions.GetLastTransitionTime(t.Cluster, clusterv1.ControlPlaneInitializedCondition).Time - clusterInfraReadyTime := conditions.GetLastTransitionTime(t.Cluster, clusterv1.InfrastructureReadyCondition).Time + controlPlaneInitialized := conditions.GetLastTransitionTime(t.Cluster, clusterv1.ControlPlaneInitializedCondition) + clusterInfraReady := conditions.GetLastTransitionTime(t.Cluster, clusterv1.InfrastructureReadyCondition) machineCreationTime := t.Machine.CreationTimestamp.Time // Use the latest of the 3 times comparisonTime := machineCreationTime - logger.V(3).Info("Determining comparison time", "machineCreationTime", machineCreationTime, "clusterInfraReadyTime", clusterInfraReadyTime, "controlPlaneInitializedTime", controlPlaneInitializedTime) + logger.V(3).Info("Determining comparison time", "machineCreationTime", machineCreationTime, "clusterInfraReadyTime", clusterInfraReady, "controlPlaneInitializedTime", 
controlPlaneInitialized) + if conditions.IsTrue(t.Cluster, clusterv1.ControlPlaneInitializedCondition) && controlPlaneInitialized != nil && controlPlaneInitialized.Time.After(comparisonTime) { + comparisonTime = controlPlaneInitialized.Time } - if clusterInfraReadyTime.After(comparisonTime) { - comparisonTime = clusterInfraReadyTime + if conditions.IsTrue(t.Cluster, clusterv1.InfrastructureReadyCondition) && clusterInfraReady != nil && clusterInfraReady.Time.After(comparisonTime) { + comparisonTime = clusterInfraReady.Time } logger.V(3).Info("Using comparison time", "time", comparisonTime) @@ -218,16 +220,18 @@ func (r *Reconciler) getTargetsFromMHC(ctx context.Context, logger logr.Logger, Machine: &machines[k], patchHelper: patchHelper, } - node, err := r.getNodeFromMachine(ctx, clusterClient, target.Machine) - if err != nil { - if !apierrors.IsNotFound(err) { - return nil, errors.Wrap(err, "error getting node") + if clusterClient != nil { + node, err := r.getNodeFromMachine(ctx, clusterClient, target.Machine) + if err != nil { + if !apierrors.IsNotFound(err) { + return nil, errors.Wrap(err, "error getting node") + } + + // A node has been seen for this machine, but it no longer exists + target.nodeMissing = true } - - // A node has been seen for this machine, but it no longer exists - target.nodeMissing = true + target.Node = node } - target.Node = node targets = append(targets, target) } return targets, nil diff --git a/test/e2e/config/docker.yaml b/test/e2e/config/docker.yaml index 708ecef3b2e0..74a52cd3647e 100644 --- a/test/e2e/config/docker.yaml +++ b/test/e2e/config/docker.yaml @@ -290,3 +290,6 @@ intervals: node-drain/wait-deployment-available: ["3m", "10s"] node-drain/wait-control-plane: ["15m", "10s"] node-drain/wait-machine-deleted: ["2m", "10s"] + kcp-remediation/wait-machines: ["5m", "10s"] + kcp-remediation/check-machines-stable: ["30s", "5s"] + kcp-remediation/wait-machine-provisioned: ["5m", "10s"] diff --git 
a/test/e2e/data/infrastructure-docker/main/cluster-template-kcp-remediation/cluster-with-kcp.yaml b/test/e2e/data/infrastructure-docker/main/cluster-template-kcp-remediation/cluster-with-kcp.yaml new file mode 100644 index 000000000000..12516bab5ccd --- /dev/null +++ b/test/e2e/data/infrastructure-docker/main/cluster-template-kcp-remediation/cluster-with-kcp.yaml @@ -0,0 +1,98 @@ +--- +# DockerCluster object referenced by the Cluster object +apiVersion: infrastructure.cluster.x-k8s.io/v1beta1 +kind: DockerCluster +metadata: + name: '${CLUSTER_NAME}' +--- +# Cluster object with +# - Reference to the KubeadmControlPlane object +# - the label cni=${CLUSTER_NAME}-crs-0, so the cluster can be selected by the ClusterResourceSet. +apiVersion: cluster.x-k8s.io/v1beta1 +kind: Cluster +metadata: + name: '${CLUSTER_NAME}' + labels: + cni: "${CLUSTER_NAME}-crs-0" +spec: + clusterNetwork: + services: + cidrBlocks: ['${DOCKER_SERVICE_CIDRS}'] + pods: + cidrBlocks: ['${DOCKER_POD_CIDRS}'] + serviceDomain: '${DOCKER_SERVICE_DOMAIN}' + infrastructureRef: + apiVersion: infrastructure.cluster.x-k8s.io/v1beta1 + kind: DockerCluster + name: '${CLUSTER_NAME}' + controlPlaneRef: + kind: KubeadmControlPlane + apiVersion: controlplane.cluster.x-k8s.io/v1beta1 + name: "${CLUSTER_NAME}-control-plane" +--- +# DockerMachineTemplate object referenced by the KubeadmControlPlane object +apiVersion: infrastructure.cluster.x-k8s.io/v1beta1 +kind: DockerMachineTemplate +metadata: + name: "${CLUSTER_NAME}-control-plane" +spec: + template: + spec: {} +--- +# KubeadmControlPlane referenced by the Cluster +kind: KubeadmControlPlane +apiVersion: controlplane.cluster.x-k8s.io/v1beta1 +metadata: + name: "${CLUSTER_NAME}-control-plane" +spec: + replicas: ${CONTROL_PLANE_MACHINE_COUNT} + machineTemplate: + infrastructureRef: + kind: DockerMachineTemplate + apiVersion: infrastructure.cluster.x-k8s.io/v1beta1 + name: "${CLUSTER_NAME}-control-plane" + kubeadmConfigSpec: + clusterConfiguration: + 
controllerManager: + extraArgs: {enable-hostpath-provisioner: 'true'} + apiServer: + # host.docker.internal is required by kubetest when running on MacOS because of the way ports are proxied. + certSANs: [localhost, 127.0.0.1, 0.0.0.0, host.docker.internal] + initConfiguration: + nodeRegistration: + kubeletExtraArgs: + eviction-hard: 'nodefs.available<0%,nodefs.inodesFree<0%,imagefs.available<0%' + joinConfiguration: + nodeRegistration: + kubeletExtraArgs: + eviction-hard: 'nodefs.available<0%,nodefs.inodesFree<0%,imagefs.available<0%' + files: + - path: /wait-signal.sh + content: | + #!/bin/bash + + set -o errexit + set -o pipefail + + echo "Waiting for signal..." + + TOKEN=$1 + SERVER=$2 + NAMESPACE=$3 + + while true; + do + sleep 1s + + signal=$(curl -k -s --header "Authorization: Bearer $TOKEN" $SERVER/api/v1/namespaces/$NAMESPACE/configmaps/mhc-test | jq -r .data.signal?) + echo "signal $signal" + + if [ "$signal" == "pass" ]; then + curl -k -s --header "Authorization: Bearer $TOKEN" -XPATCH -H "Content-Type: application/strategic-merge-patch+json" --data '{"data": {"signal": "ack-pass"}}' $SERVER/api/v1/namespaces/$NAMESPACE/configmaps/mhc-test + exit 0 + fi + done + permissions: "0777" + preKubeadmCommands: + - ./wait-signal.sh "${TOKEN}" "${SERVER}" "${NAMESPACE}" + version: "${KUBERNETES_VERSION}" diff --git a/test/e2e/data/infrastructure-docker/main/cluster-template-kcp-remediation/kustomization.yaml b/test/e2e/data/infrastructure-docker/main/cluster-template-kcp-remediation/kustomization.yaml index e234e37be1b2..92761f83d820 100644 --- a/test/e2e/data/infrastructure-docker/main/cluster-template-kcp-remediation/kustomization.yaml +++ b/test/e2e/data/infrastructure-docker/main/cluster-template-kcp-remediation/kustomization.yaml @@ -1,5 +1,5 @@ bases: - - ../bases/cluster-with-kcp.yaml + - cluster-with-kcp.yaml - ../bases/md.yaml - ../bases/crs.yaml - mhc.yaml diff --git a/test/e2e/data/infrastructure-docker/main/cluster-template-kcp-remediation/mhc.yaml 
b/test/e2e/data/infrastructure-docker/main/cluster-template-kcp-remediation/mhc.yaml index 3ed3e0a9473a..39187cec0a40 100644 --- a/test/e2e/data/infrastructure-docker/main/cluster-template-kcp-remediation/mhc.yaml +++ b/test/e2e/data/infrastructure-docker/main/cluster-template-kcp-remediation/mhc.yaml @@ -1,7 +1,8 @@ --- # MachineHealthCheck object with -# - a selector that targets all the machines with label cluster.x-k8s.io/control-plane="" -# - unhealthyConditions triggering remediation after 10s the condition is set +# - a selector that targets all the machines with label cluster.x-k8s.io/control-plane="" and the mhc-test: "fail" (the label is used to trigger remediation in a controlled way - by adding CP under MHC control intentionally -) +# - nodeStartupTimeout: 30s (to force remediation on nodes still provisioning) +# - unhealthyConditions triggering remediation after 10s the e2e.remediation.condition condition is set to false (to force remediation on nodes already provisioned) apiVersion: cluster.x-k8s.io/v1beta1 kind: MachineHealthCheck metadata: @@ -12,6 +13,8 @@ spec: selector: matchLabels: cluster.x-k8s.io/control-plane: "" + mhc-test: "fail" + nodeStartupTimeout: 30s unhealthyConditions: - type: e2e.remediation.condition status: "False" diff --git a/test/e2e/kcp_remediations.go b/test/e2e/kcp_remediations.go new file mode 100644 index 000000000000..64fc229b0b59 --- /dev/null +++ b/test/e2e/kcp_remediations.go @@ -0,0 +1,687 @@ +/* +Copyright 2020 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +*/ + +package e2e + +import ( + "context" + "fmt" + "os" + "path/filepath" + "time" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + authenticationv1 "k8s.io/api/authentication/v1" + corev1 "k8s.io/api/core/v1" + rbacv1 "k8s.io/api/rbac/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/sets" + "k8s.io/client-go/tools/clientcmd" + "k8s.io/utils/pointer" + "sigs.k8s.io/controller-runtime/pkg/client" + + clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" + "sigs.k8s.io/cluster-api/test/e2e/internal/log" + "sigs.k8s.io/cluster-api/test/framework" + "sigs.k8s.io/cluster-api/test/framework/clusterctl" + "sigs.k8s.io/cluster-api/util" +) + +const ( + configMapName = "mhc-test" + configMapDataKey = "signal" + + failLabelValue = "fail" +) + +// KCPRemediationSpecInput is the input for KCPRemediationSpec. +type KCPRemediationSpecInput struct { + // This spec requires the following intervals to be defined in order to work: + // - wait-cluster, used when waiting for the cluster infrastructure to be provisioned. + // - wait-machines, used when waiting for an old machine to be remediated and a new one provisioned. + // - check-machines-stable, used when checking that the current list of machines is stable. + // - wait-machine-provisioned, used when waiting for a machine to be provisioned after unblocking bootstrap. + E2EConfig *clusterctl.E2EConfig + ClusterctlConfigPath string + BootstrapClusterProxy framework.ClusterProxy + ArtifactFolder string + SkipCleanup bool + + // Flavor, if specified, must refer to a template that has a MachineHealthCheck + // - 3 node CP, no workers + // - Control plane machines having a pre-kubeadm command that queries for a well-known ConfigMap on the management cluster, + // holding up bootstrap until a signal is passed via the config map. 
+ // NOTE: In order for this to work communications from workload cluster to management cluster must be enabled. + // - An MHC targeting control plane machines with the mhc-test=fail labels and + // nodeStartupTimeout: 30s + // unhealthyConditions: + // - type: e2e.remediation.condition + // status: "False" + // timeout: 10s + // If not specified, "kcp-remediation" is used. + Flavor *string +} + +// KCPRemediationSpec implements a test that verifies that Machines are remediated by MHC during unhealthy conditions. +func KCPRemediationSpec(ctx context.Context, inputGetter func() KCPRemediationSpecInput) { + var ( + specName = "kcp-remediation" + input KCPRemediationSpecInput + namespace *corev1.Namespace + cancelWatches context.CancelFunc + clusterResources *clusterctl.ApplyClusterTemplateAndWaitResult + ) + + BeforeEach(func() { + Expect(ctx).NotTo(BeNil(), "ctx is required for %s spec", specName) + input = inputGetter() + Expect(input.E2EConfig).ToNot(BeNil(), "Invalid argument. input.E2EConfig can't be nil when calling %s spec", specName) + Expect(input.ClusterctlConfigPath).To(BeAnExistingFile(), "Invalid argument. input.ClusterctlConfigPath must be an existing file when calling %s spec", specName) + Expect(input.BootstrapClusterProxy).ToNot(BeNil(), "Invalid argument. input.BootstrapClusterProxy can't be nil when calling %s spec", specName) + Expect(os.MkdirAll(input.ArtifactFolder, 0750)).To(Succeed(), "Invalid argument. input.ArtifactFolder can't be created for %s spec", specName) + Expect(input.E2EConfig.Variables).To(HaveKey(KubernetesVersion)) + + // Setup a Namespace where to host objects for this spec and create a watcher for the namespace events. 
+ namespace, cancelWatches = setupSpecNamespace(ctx, specName, input.BootstrapClusterProxy, input.ArtifactFolder) + }) + + It("Should replace unhealthy machines", func() { + By("Creating a workload cluster") + + // NOTE: This test is quite different from other tests, because it has to trigger failures on machines in a controlled ways. + + // creates the mhc-test ConfigMap that will be used to control machines bootstrap during the remediation tests. + createConfigMapForMachinesBootstrapSignal(ctx, input.BootstrapClusterProxy.GetClient(), namespace.Name) + + // Creates the workload cluster. + clusterResources = createWorkloadClusterAndWait(ctx, createWorkloadClusterAndWaitInput{ + E2EConfig: input.E2EConfig, + ClusterctlConfigPath: input.ClusterctlConfigPath, + Proxy: input.BootstrapClusterProxy, + ArtifactFolder: input.ArtifactFolder, + SpecName: specName, + Flavor: pointer.StringDeref(input.Flavor, "kcp-remediation"), + + // values to be injected in the template + + Namespace: namespace.Name, + // Token with credentials to use for accessing the ConfigMap on managements cluster from the workload cluster. + // NOTE: this func also setups credentials/RBAC rules and everything necessary to get the authenticationToken. + AuthenticationToken: getAuthenticationToken(ctx, input.BootstrapClusterProxy, namespace.Name), + // Address to be used for accessing the management cluster from a workload cluster. 
+ ServerAddr: getServerAddr(ctx, input.BootstrapClusterProxy), + }) + + // The first CP machine comes up but it does not complete bootstrap + + By("FIRST CONTROL PLANE MACHINE") + + By("Wait for the cluster to get stuck with the first CP machine not completing the bootstrap") + allMachines, newMachines := waitForMachines(ctx, waitForMachinesInput{ + Lister: input.BootstrapClusterProxy.GetClient(), + Namespace: namespace.Name, + ClusterName: clusterResources.Cluster.Name, + ExpectedReplicas: 1, + WaitForMachinesIntervals: input.E2EConfig.GetIntervals(specName, "wait-machines"), + CheckMachineListStableIntervals: input.E2EConfig.GetIntervals(specName, "check-machines-stable"), + }) + Expect(allMachines).To(HaveLen(1)) + Expect(newMachines).To(HaveLen(1)) + firstMachineName := newMachines[0] + firstMachine := &clusterv1.Machine{ + ObjectMeta: metav1.ObjectMeta{ + Name: firstMachineName, + Namespace: namespace.Name, + }, + } + Expect(input.BootstrapClusterProxy.GetClient().Get(ctx, client.ObjectKeyFromObject(firstMachine), firstMachine)).To(Succeed(), "Failed to get machine %d", firstMachineName) + Expect(firstMachine.Status.NodeRef).To(BeNil()) + log.Logf("Machine %s is up but still bootstrapping", firstMachineName) + + // Intentionally trigger remediation on the first CP, and validate the first machine is deleted and a replacement should come up. 
+ + By("REMEDIATING FIRST CONTROL PLANE MACHINE") + + Byf("Add mhc-test:fail label to machine %s so it will be immediately remediated", firstMachineName) + firstMachineWithLabel := firstMachine.DeepCopy() + firstMachineWithLabel.Labels["mhc-test"] = failLabelValue + Expect(input.BootstrapClusterProxy.GetClient().Patch(ctx, firstMachineWithLabel, client.MergeFrom(firstMachine))).To(Succeed(), "Failed to patch machine %d", firstMachineName) + + log.Logf("Wait for the first CP machine to be remediated, and the replacement machine to come up, but again get stuck with the Machine not completing the bootstrap") + allMachines, newMachines = waitForMachines(ctx, waitForMachinesInput{ + Lister: input.BootstrapClusterProxy.GetClient(), + Namespace: namespace.Name, + ClusterName: clusterResources.Cluster.Name, + ExpectedReplicas: 1, + ExpectedDeletedMachines: []string{firstMachineName}, + WaitForMachinesIntervals: input.E2EConfig.GetIntervals(specName, "wait-machines"), + CheckMachineListStableIntervals: input.E2EConfig.GetIntervals(specName, "check-machines-stable"), + }) + Expect(allMachines).To(HaveLen(1)) + Expect(newMachines).To(HaveLen(1)) + firstMachineReplacementName := newMachines[0] + firstMachineReplacement := &clusterv1.Machine{ + ObjectMeta: metav1.ObjectMeta{ + Name: firstMachineReplacementName, + Namespace: namespace.Name, + }, + } + Expect(input.BootstrapClusterProxy.GetClient().Get(ctx, client.ObjectKeyFromObject(firstMachineReplacement), firstMachineReplacement)).To(Succeed(), "Failed to get machine %d", firstMachineReplacementName) + Expect(firstMachineReplacement.Status.NodeRef).To(BeNil()) + log.Logf("Machine %s is up but still bootstrapping", firstMachineReplacementName) + + // The firstMachine replacement is up, meaning that the test validated that remediation of the first CP machine works (note: first CP is a special case because the cluster is not initialized yet). 
+ // In order to test remediation of other machine while provisioning we unblock bootstrap of the first CP replacement + // and wait for the second cp machine to come up. + + By("FIRST CONTROL PLANE MACHINE SUCCESSFULLY REMEDIATED!") + + Byf("Unblock bootstrap for Machine %s and wait for it to be provisioned", firstMachineReplacementName) + sendSignalToBootstrappingMachine(ctx, sendSignalToBootstrappingMachineInput{ + Client: input.BootstrapClusterProxy.GetClient(), + Namespace: namespace.Name, + Machine: firstMachineReplacementName, + Signal: "pass", + }) + log.Logf("Waiting for Machine %s to be provisioned", firstMachineReplacementName) + Eventually(func() bool { + if err := input.BootstrapClusterProxy.GetClient().Get(ctx, client.ObjectKeyFromObject(firstMachineReplacement), firstMachineReplacement); err != nil { + return false + } + return firstMachineReplacement.Status.NodeRef != nil + }, input.E2EConfig.GetIntervals(specName, "wait-machine-provisioned")...).Should(BeTrue(), "Machine %s failed to be provisioned", firstMachineReplacementName) + + By("FIRST CONTROL PLANE MACHINE UP AND RUNNING!") + By("START PROVISIONING OF SECOND CONTROL PLANE MACHINE!") + + By("Wait for the cluster to get stuck with the second CP machine not completing the bootstrap") + allMachines, newMachines = waitForMachines(ctx, waitForMachinesInput{ + Lister: input.BootstrapClusterProxy.GetClient(), + Namespace: namespace.Name, + ClusterName: clusterResources.Cluster.Name, + ExpectedReplicas: 2, + ExpectedDeletedMachines: []string{}, + ExpectedOldMachines: []string{firstMachineReplacementName}, + WaitForMachinesIntervals: input.E2EConfig.GetIntervals(specName, "wait-machines"), + CheckMachineListStableIntervals: input.E2EConfig.GetIntervals(specName, "check-machines-stable"), + }) + Expect(allMachines).To(HaveLen(2)) + Expect(newMachines).To(HaveLen(1)) + secondMachineName := newMachines[0] + secondMachine := &clusterv1.Machine{ + ObjectMeta: metav1.ObjectMeta{ + Name: secondMachineName, 
+ Namespace: namespace.Name, + }, + } + Expect(input.BootstrapClusterProxy.GetClient().Get(ctx, client.ObjectKeyFromObject(secondMachine), secondMachine)).To(Succeed(), "Failed to get machine %s", secondMachineName) + Expect(secondMachine.Status.NodeRef).To(BeNil()) + log.Logf("Machine %s is up but still bootstrapping", secondMachineName) + + // Intentionally trigger remediation on the second CP and validate that also this one is deleted and a replacement should come up. + + By("REMEDIATING SECOND CONTROL PLANE MACHINE") + + Byf("Add mhc-test:fail label to machine %s so it will be immediately remediated", secondMachineName) + secondMachineWithLabel := secondMachine.DeepCopy() + secondMachineWithLabel.Labels["mhc-test"] = failLabelValue + Expect(input.BootstrapClusterProxy.GetClient().Patch(ctx, secondMachineWithLabel, client.MergeFrom(secondMachine))).To(Succeed(), "Failed to patch machine %s", secondMachineName) + + log.Logf("Wait for the second CP machine to be remediated, and the replacement machine to come up, but again get stuck with the Machine not completing the bootstrap") + allMachines, newMachines = waitForMachines(ctx, waitForMachinesInput{ + Lister: input.BootstrapClusterProxy.GetClient(), + Namespace: namespace.Name, + ClusterName: clusterResources.Cluster.Name, + ExpectedReplicas: 2, + ExpectedDeletedMachines: []string{secondMachineName}, + ExpectedOldMachines: []string{firstMachineReplacementName}, + WaitForMachinesIntervals: input.E2EConfig.GetIntervals(specName, "wait-machines"), + CheckMachineListStableIntervals: input.E2EConfig.GetIntervals(specName, "check-machines-stable"), + }) + Expect(allMachines).To(HaveLen(2)) + Expect(newMachines).To(HaveLen(1)) + secondMachineReplacementName := newMachines[0] + secondMachineReplacement := &clusterv1.Machine{ + ObjectMeta: metav1.ObjectMeta{ + Name: secondMachineReplacementName, + Namespace: namespace.Name, + }, + } + Expect(input.BootstrapClusterProxy.GetClient().Get(ctx, 
client.ObjectKeyFromObject(secondMachineReplacement), secondMachineReplacement)).To(Succeed(), "Failed to get machine %d", secondMachineReplacementName) + Expect(secondMachineReplacement.Status.NodeRef).To(BeNil()) + log.Logf("Machine %s is up but still bootstrapping", secondMachineReplacementName) + + // The secondMachine replacement is up, meaning that the test validated that remediation of the second CP machine works (note: this test remediation after the cluster is initialized, but not yet fully provisioned). + // In order to test remediation after provisioning we unblock bootstrap of the second CP replacement as well as for the third CP machine. + // and wait for the second cp machine to come up. + + By("SECOND CONTROL PLANE MACHINE SUCCESSFULLY REMEDIATED!") + + Byf("Unblock bootstrap for Machine %s and wait for it to be provisioned", secondMachineReplacementName) + sendSignalToBootstrappingMachine(ctx, sendSignalToBootstrappingMachineInput{ + Client: input.BootstrapClusterProxy.GetClient(), + Namespace: namespace.Name, + Machine: secondMachineReplacementName, + Signal: "pass", + }) + log.Logf("Waiting for Machine %s to be provisioned", secondMachineReplacementName) + Eventually(func() bool { + if err := input.BootstrapClusterProxy.GetClient().Get(ctx, client.ObjectKeyFromObject(secondMachineReplacement), secondMachineReplacement); err != nil { + return false + } + return secondMachineReplacement.Status.NodeRef != nil + }, input.E2EConfig.GetIntervals(specName, "wait-machine-provisioned")...).Should(BeTrue(), "Machine %s failed to be provisioned", secondMachineReplacementName) + + By("SECOND CONTROL PLANE MACHINE UP AND RUNNING!") + By("START PROVISIONING OF THIRD CONTROL PLANE MACHINE!") + + By("Wait for the cluster to get stuck with the third CP machine not completing the bootstrap") + allMachines, newMachines = waitForMachines(ctx, waitForMachinesInput{ + Lister: input.BootstrapClusterProxy.GetClient(), + Namespace: namespace.Name, + ClusterName: 
clusterResources.Cluster.Name, + ExpectedReplicas: 3, + ExpectedDeletedMachines: []string{}, + ExpectedOldMachines: []string{firstMachineReplacementName, secondMachineReplacementName}, + WaitForMachinesIntervals: input.E2EConfig.GetIntervals(specName, "wait-machines"), + CheckMachineListStableIntervals: input.E2EConfig.GetIntervals(specName, "check-machines-stable"), + }) + Expect(allMachines).To(HaveLen(3)) + Expect(newMachines).To(HaveLen(1)) + thirdMachineName := newMachines[0] + thirdMachine := &clusterv1.Machine{ + ObjectMeta: metav1.ObjectMeta{ + Name: thirdMachineName, + Namespace: namespace.Name, + }, + } + Expect(input.BootstrapClusterProxy.GetClient().Get(ctx, client.ObjectKeyFromObject(thirdMachine), thirdMachine)).To(Succeed(), "Failed to get machine %d", thirdMachineName) + Expect(thirdMachine.Status.NodeRef).To(BeNil()) + log.Logf("Machine %s is up but still bootstrapping", thirdMachineName) + + Byf("Unblock bootstrap for Machine %s and wait for it to be provisioned", thirdMachineName) + sendSignalToBootstrappingMachine(ctx, sendSignalToBootstrappingMachineInput{ + Client: input.BootstrapClusterProxy.GetClient(), + Namespace: namespace.Name, + Machine: thirdMachineName, + Signal: "pass", + }) + log.Logf("Waiting for Machine %s to be provisioned", thirdMachineName) + Eventually(func() bool { + if err := input.BootstrapClusterProxy.GetClient().Get(ctx, client.ObjectKeyFromObject(thirdMachine), thirdMachine); err != nil { + return false + } + return thirdMachine.Status.NodeRef != nil + }, input.E2EConfig.GetIntervals(specName, "wait-machine-provisioned")...).Should(BeTrue(), "Machine %s failed to be provisioned", thirdMachineName) + + // All three CP machines are up. + + By("ALL THE CONTROL PLANE MACHINES SUCCESSFULLY PROVISIONED!") + + // We now want to test remediation of a CP machine already provisioned. 
+ // In order to do so we need to apply both mhc-test:fail as well as setting an unhealthy condition in order to trigger remediation + + By("REMEDIATING THIRD CP") + + Byf("Add mhc-test:fail label to machine %s and set an unhealthy condition on the node so it will be immediately remediated", thirdMachineName) + thirdMachineWithLabel := thirdMachine.DeepCopy() + thirdMachineWithLabel.Labels["mhc-test"] = failLabelValue + Expect(input.BootstrapClusterProxy.GetClient().Patch(ctx, thirdMachineWithLabel, client.MergeFrom(thirdMachine))).To(Succeed(), "Failed to patch machine %s", thirdMachineName) + + unhealthyNodeCondition := corev1.NodeCondition{ + Type: "e2e.remediation.condition", + Status: "False", + LastTransitionTime: metav1.Time{Time: time.Now()}, + } + framework.PatchNodeCondition(ctx, framework.PatchNodeConditionInput{ + ClusterProxy: input.BootstrapClusterProxy, + Cluster: clusterResources.Cluster, + NodeCondition: unhealthyNodeCondition, + Machine: *thirdMachine, // TODO: make this a pointer. 
+ }) + + log.Logf("Wait for the third CP machine to be remediated, and the replacement machine to come up, but again get stuck with the Machine not completing the bootstrap") + allMachines, newMachines = waitForMachines(ctx, waitForMachinesInput{ + Lister: input.BootstrapClusterProxy.GetClient(), + Namespace: namespace.Name, + ClusterName: clusterResources.Cluster.Name, + ExpectedReplicas: 3, + ExpectedDeletedMachines: []string{thirdMachineName}, + ExpectedOldMachines: []string{firstMachineReplacementName, secondMachineReplacementName}, + WaitForMachinesIntervals: input.E2EConfig.GetIntervals(specName, "wait-machines"), + CheckMachineListStableIntervals: input.E2EConfig.GetIntervals(specName, "check-machines-stable"), + }) + Expect(allMachines).To(HaveLen(3)) + Expect(newMachines).To(HaveLen(1)) + thirdMachineReplacementName := newMachines[0] + thirdMachineReplacement := &clusterv1.Machine{ + ObjectMeta: metav1.ObjectMeta{ + Name: thirdMachineReplacementName, + Namespace: namespace.Name, + }, + } + Expect(input.BootstrapClusterProxy.GetClient().Get(ctx, client.ObjectKeyFromObject(thirdMachineReplacement), thirdMachineReplacement)).To(Succeed(), "Failed to get machine %d", thirdMachineReplacementName) + Expect(thirdMachineReplacement.Status.NodeRef).To(BeNil()) + log.Logf("Machine %s is up but still bootstrapping", thirdMachineReplacementName) + + // The thirdMachine replacement is up, meaning that the test validated that remediation of the third CP machine works (note: this test remediation after the cluster is fully provisioned). 
+ + By("THIRD CP SUCCESSFULLY REMEDIATED!") + + Byf("Unblock bootstrap for Machine %s and wait for it to be provisioned", thirdMachineReplacementName) + sendSignalToBootstrappingMachine(ctx, sendSignalToBootstrappingMachineInput{ + Client: input.BootstrapClusterProxy.GetClient(), + Namespace: namespace.Name, + Machine: thirdMachineReplacementName, + Signal: "pass", + }) + log.Logf("Waiting for Machine %s to be provisioned", thirdMachineReplacementName) + Eventually(func() bool { + if err := input.BootstrapClusterProxy.GetClient().Get(ctx, client.ObjectKeyFromObject(thirdMachineReplacement), thirdMachineReplacement); err != nil { + return false + } + return thirdMachineReplacement.Status.NodeRef != nil + }, input.E2EConfig.GetIntervals(specName, "wait-machine-provisioned")...).Should(BeTrue(), "Machine %s failed to be provisioned", thirdMachineReplacementName) + + // All three CP machines are up again. + + By("CP BACK TO FULL OPERATIONAL STATE!") + + By("PASSED!") + }) + + AfterEach(func() { + // Dumps all the resources in the spec namespace, then cleanups the cluster object and the spec namespace itself. 
+ dumpSpecResourcesAndCleanup(ctx, specName, input.BootstrapClusterProxy, input.ArtifactFolder, namespace, cancelWatches, clusterResources.Cluster, input.E2EConfig.GetIntervals, input.SkipCleanup) + }) +} + +func createConfigMapForMachinesBootstrapSignal(ctx context.Context, writer client.Writer, namespace string) { + cm := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: configMapName, + Namespace: namespace, + }, + Data: map[string]string{ + configMapDataKey: "hold", + }, + } + Expect(writer.Create(ctx, cm)).To(Succeed(), "failed to create mhc-test config map") +} + +type createWorkloadClusterAndWaitInput struct { + E2EConfig *clusterctl.E2EConfig + ClusterctlConfigPath string + Proxy framework.ClusterProxy + ArtifactFolder string + SpecName string + Flavor string + Namespace string + AuthenticationToken string + ServerAddr string +} + +// createWorkloadClusterAndWait creates a workload cluster and return as soon as the cluster infrastructure is ready. +// NOTE: we are not using the same func used by other tests because it would fail if the control plane doesn't come up, +// which instead is expected in this case. +// NOTE: clusterResources is filled only partially. 
+func createWorkloadClusterAndWait(ctx context.Context, input createWorkloadClusterAndWaitInput) (clusterResources *clusterctl.ApplyClusterTemplateAndWaitResult) { + clusterResources = new(clusterctl.ApplyClusterTemplateAndWaitResult) + + // gets the cluster template + log.Logf("Getting the cluster template yaml") + clusterName := fmt.Sprintf("%s-%s", input.SpecName, util.RandomString(6)) + workloadClusterTemplate := clusterctl.ConfigCluster(ctx, clusterctl.ConfigClusterInput{ + // pass the clusterctl config file that points to the local provider repository created for this test, + ClusterctlConfigPath: input.ClusterctlConfigPath, + // pass reference to the management cluster hosting this test + KubeconfigPath: input.Proxy.GetKubeconfigPath(), + + // select template + Flavor: input.Flavor, + // define template variables + Namespace: input.Namespace, + ClusterName: clusterName, + KubernetesVersion: input.E2EConfig.GetVariable(KubernetesVersion), + ControlPlaneMachineCount: pointer.Int64(3), + WorkerMachineCount: pointer.Int64(0), + InfrastructureProvider: clusterctl.DefaultInfrastructureProvider, + // setup clusterctl logs folder + LogFolder: filepath.Join(input.ArtifactFolder, "clusters", input.Proxy.GetName()), + // Adds authenticationToken, server address and namespace variables to be injected in the cluster template. 
+ ClusterctlVariables: map[string]string{ + "TOKEN": input.AuthenticationToken, + "SERVER": input.ServerAddr, + "NAMESPACE": input.Namespace, + }, + }) + Expect(workloadClusterTemplate).ToNot(BeNil(), "Failed to get the cluster template") + + Eventually(func() error { + return input.Proxy.Apply(ctx, workloadClusterTemplate) + }, 10*time.Second).Should(Succeed(), "Failed to apply the cluster template") + + log.Logf("Waiting for the cluster infrastructure to be provisioned") + clusterResources.Cluster = framework.DiscoveryAndWaitForCluster(ctx, framework.DiscoveryAndWaitForClusterInput{ + Getter: input.Proxy.GetClient(), + Namespace: input.Namespace, + Name: clusterName, + }, input.E2EConfig.GetIntervals(input.SpecName, "wait-cluster")...) + + return clusterResources +} + +type sendSignalToBootstrappingMachineInput struct { + Client client.Client + Namespace string + Machine string + Signal string +} + +// sendSignalToBootstrappingMachine sends a signal to a machine stuck during bootstrap. 
+func sendSignalToBootstrappingMachine(ctx context.Context, input sendSignalToBootstrappingMachineInput) { + log.Logf("Sending bootstrap signal %s to Machine %s", input.Signal, input.Machine) + cm := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: configMapName, + Namespace: input.Namespace, + }, + } + Expect(input.Client.Get(ctx, client.ObjectKeyFromObject(cm), cm)).To(Succeed(), "failed to get mhc-test config map") + + cmWithSignal := cm.DeepCopy() + cmWithSignal.Data[configMapDataKey] = input.Signal + Expect(input.Client.Patch(ctx, cmWithSignal, client.MergeFrom(cm))).To(Succeed(), "failed to patch mhc-test config map") + + log.Logf("Waiting for Machine %s to acknowledge signal %s has been received", input.Machine, input.Signal) + Eventually(func() string { + _ = input.Client.Get(ctx, client.ObjectKeyFromObject(cmWithSignal), cmWithSignal) + return cmWithSignal.Data[configMapDataKey] + }, "1m", "10s").Should(Equal(fmt.Sprintf("ack-%s", input.Signal)), "Failed to get ack signal from machine %s", input.Machine) + + machine := &clusterv1.Machine{ + ObjectMeta: metav1.ObjectMeta{ + Name: input.Machine, + Namespace: input.Namespace, + }, + } + Expect(input.Client.Get(ctx, client.ObjectKeyFromObject(machine), machine)).To(Succeed()) + + // Resetting the signal in the config map + cmWithSignal.Data[configMapDataKey] = "hold" + Expect(input.Client.Patch(ctx, cmWithSignal, client.MergeFrom(cm))).To(Succeed(), "failed to patch mhc-test config map") +} + +type waitForMachinesInput struct { + Lister framework.Lister + Namespace string + ClusterName string + ExpectedReplicas int + ExpectedOldMachines []string + ExpectedDeletedMachines []string + WaitForMachinesIntervals []interface{} + CheckMachineListStableIntervals []interface{} +} + +// waitForMachines waits for machines to reach a well known state defined by number of replicas, a list of machines to exists, +// a list of machines to not exist anymore. 
The func also check that the state is stable for some time before +// returning the list of new machines. +func waitForMachines(ctx context.Context, input waitForMachinesInput) (allMachineNames, newMachineNames []string) { + inClustersNamespaceListOption := client.InNamespace(input.Namespace) + matchClusterListOption := client.MatchingLabels{ + clusterv1.ClusterNameLabel: input.ClusterName, + clusterv1.MachineControlPlaneLabel: "", + } + + expectedOldMachines := sets.Set[string]{}.Insert(input.ExpectedOldMachines...) + expectedDeletedMachines := sets.Set[string]{}.Insert(input.ExpectedDeletedMachines...) + allMachines := sets.Set[string]{} + newMachines := sets.Set[string]{} + machineList := &clusterv1.MachineList{} + + // Waits for the desired set of machines to exist. + log.Logf("Waiting for %d machines, must have %s, must not have %s", input.ExpectedReplicas, expectedOldMachines.UnsortedList(), expectedDeletedMachines.UnsortedList()) + Eventually(func(g Gomega) { + // Gets the list of machines + g.Expect(input.Lister.List(ctx, machineList, inClustersNamespaceListOption, matchClusterListOption)).To(Succeed()) + + allMachines = sets.Set[string]{} + for i := range machineList.Items { + allMachines.Insert(machineList.Items[i].Name) + } + + // Compute new machines (all - old - to be deleted) + newMachines = allMachines.Clone() + newMachines.Delete(expectedOldMachines.UnsortedList()...) + newMachines.Delete(expectedDeletedMachines.UnsortedList()...) + + log.Logf(" - expected %d, got %d: %s, of which new %s, must have check: %t, must not have check: %t", input.ExpectedReplicas, allMachines.Len(), allMachines.UnsortedList(), newMachines.UnsortedList(), allMachines.HasAll(expectedOldMachines.UnsortedList()...), !allMachines.HasAny(expectedDeletedMachines.UnsortedList()...)) + + // Ensures all the expected old machines are still there. 
+ g.Expect(allMachines.HasAll(expectedOldMachines.UnsortedList()...)).To(BeTrue(), + "Got machines: %s, must contain all of: %s", allMachines.UnsortedList(), expectedOldMachines.UnsortedList()) + + // Ensures none of the machines to be deleted is still there. + g.Expect(!allMachines.HasAny(expectedDeletedMachines.UnsortedList()...)).To(BeTrue(), + "Got machines: %s, must not contain any of: %s", allMachines.UnsortedList(), expectedDeletedMachines.UnsortedList()) + + g.Expect(allMachines).To(HaveLen(input.ExpectedReplicas), "Got %d machines, must be %d", len(allMachines), input.ExpectedReplicas) + }, input.WaitForMachinesIntervals...).Should(Succeed(), + "Failed to get the expected list of machines: got %s (expected %d machines, must have %s, must not have %s)", + allMachines.UnsortedList(), input.ExpectedReplicas, expectedOldMachines.UnsortedList(), expectedDeletedMachines.UnsortedList()) + log.Logf("Got %d machines: %s", allMachines.Len(), allMachines.UnsortedList()) + + // Ensures the desired set of machines is stable (no further machines are created or deleted). + log.Logf("Checking the list of machines is stable") + allMachinesNow := sets.Set[string]{} + Consistently(func() bool { + // Gets the list of machines + if err := input.Lister.List(ctx, machineList, inClustersNamespaceListOption, matchClusterListOption); err != nil { + return false + } + allMachinesNow = sets.Set[string]{} + for i := range machineList.Items { + allMachinesNow.Insert(machineList.Items[i].Name) + } + + return allMachines.Equal(allMachinesNow) + }, input.CheckMachineListStableIntervals...).Should(BeTrue(), "Expected list of machines is not stable: got %s, expected %s", allMachinesNow.UnsortedList(), allMachines.UnsortedList()) + + return allMachines.UnsortedList(), newMachines.UnsortedList() +} + +// getServerAddr returns the address to be used for accessing the management cluster from a workload cluster. 
+func getServerAddr(ctx context.Context, clusterProxy framework.ClusterProxy) string { + // With CAPD, we can't just access the bootstrap cluster via 127.0.0.1: from the + // workload cluster. Instead we retrieve the server name from the cluster-info ConfigMap in the bootstrap + // cluster (e.g. "https://test-z45p9k-control-plane:6443") + // Note: This has been tested with MacOS,Linux and Prow. + clusterInfoCM := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: "cluster-info", + Namespace: metav1.NamespacePublic, + }, + } + Expect(clusterProxy.GetClient().Get(ctx, client.ObjectKeyFromObject(clusterInfoCM), clusterInfoCM)).To(Succeed()) + Expect(clusterInfoCM.Data).To(HaveKey("kubeconfig")) + + kubeConfigString := clusterInfoCM.Data["kubeconfig"] + + kubeConfig, err := clientcmd.Load([]byte(kubeConfigString)) + Expect(err).ToNot(HaveOccurred()) + + return kubeConfig.Clusters[""].Server +} + +// getAuthenticationToken returns a bearer authenticationToken with minimal RBAC permissions to access the mhc-test ConfigMap that will be used +// to control machines bootstrap during the remediation tests. 
+func getAuthenticationToken(ctx context.Context, managementClusterProxy framework.ClusterProxy, namespace string) string { + sa := &corev1.ServiceAccount{ + ObjectMeta: metav1.ObjectMeta{ + Name: "mhc-test", + Namespace: namespace, + }, + } + Expect(managementClusterProxy.GetClient().Create(ctx, sa)).To(Succeed(), "failed to create mhc-test service account") + + role := &rbacv1.Role{ + ObjectMeta: metav1.ObjectMeta{ + Name: "mhc-test", + Namespace: namespace, + }, + Rules: []rbacv1.PolicyRule{ + { + Verbs: []string{"get", "list", "patch"}, + APIGroups: []string{""}, + Resources: []string{"configmaps"}, + ResourceNames: []string{"mhc-test"}, + }, + }, + } + Expect(managementClusterProxy.GetClient().Create(ctx, role)).To(Succeed(), "failed to create mhc-test role") + + roleBinding := &rbacv1.RoleBinding{ + ObjectMeta: metav1.ObjectMeta{ + Name: "mhc-test", + Namespace: namespace, + }, + Subjects: []rbacv1.Subject{ + { + Kind: "ServiceAccount", + APIGroup: "", + Name: "mhc-test", + Namespace: namespace, + }, + }, + RoleRef: rbacv1.RoleRef{ + APIGroup: rbacv1.SchemeGroupVersion.Group, + Kind: "Role", + Name: "mhc-test", + }, + } + Expect(managementClusterProxy.GetClient().Create(ctx, roleBinding)).To(Succeed(), "failed to create mhc-test role binding") + + tokenRequest := &authenticationv1.TokenRequest{ + Spec: authenticationv1.TokenRequestSpec{ + ExpirationSeconds: pointer.Int64(2 * 60 * 60), // 2 hours. 
+ }, + } + tokenRequest, err := managementClusterProxy.GetClientSet().CoreV1().ServiceAccounts(namespace).CreateToken(ctx, "mhc-test", tokenRequest, metav1.CreateOptions{}) + Expect(err).ToNot(HaveOccurred()) + + return tokenRequest.Status.Token +} diff --git a/test/e2e/mhc_remediations_test.go b/test/e2e/kcp_remediations_test.go similarity index 83% rename from test/e2e/mhc_remediations_test.go rename to test/e2e/kcp_remediations_test.go index 66471901d939..cdd9fd50efc4 100644 --- a/test/e2e/mhc_remediations_test.go +++ b/test/e2e/kcp_remediations_test.go @@ -23,9 +23,9 @@ import ( . "github.com/onsi/ginkgo/v2" ) -var _ = Describe("When testing unhealthy machines remediation", func() { - MachineRemediationSpec(ctx, func() MachineRemediationSpecInput { - return MachineRemediationSpecInput{ +var _ = Describe("When testing KCP remediation", func() { + KCPRemediationSpec(ctx, func() KCPRemediationSpecInput { + return KCPRemediationSpecInput{ E2EConfig: e2eConfig, ClusterctlConfigPath: clusterctlConfigPath, BootstrapClusterProxy: bootstrapClusterProxy, diff --git a/test/e2e/mhc_remediations.go b/test/e2e/md_remediations.go similarity index 61% rename from test/e2e/mhc_remediations.go rename to test/e2e/md_remediations.go index b1c3989d4f61..7d95a032518f 100644 --- a/test/e2e/mhc_remediations.go +++ b/test/e2e/md_remediations.go @@ -32,8 +32,8 @@ import ( "sigs.k8s.io/cluster-api/util" ) -// MachineRemediationSpecInput is the input for MachineRemediationSpec. -type MachineRemediationSpecInput struct { +// MachineDeploymentRemediationSpecInput is the input for MachineDeploymentRemediationSpec. 
+type MachineDeploymentRemediationSpecInput struct { E2EConfig *clusterctl.E2EConfig ClusterctlConfigPath string BootstrapClusterProxy framework.ClusterProxy @@ -41,26 +41,19 @@ type MachineRemediationSpecInput struct { SkipCleanup bool ControlPlaneWaiters clusterctl.ControlPlaneWaiters - // KCPFlavor, if specified, must refer to a template that has a MachineHealthCheck - // resource configured to match the control plane Machines (cluster.x-k8s.io/controlplane: "" label) - // and be configured to treat "e2e.remediation.condition" "False" as an unhealthy - // condition with a short timeout. - // If not specified, "kcp-remediation" is used. - KCPFlavor *string - - // MDFlavor, if specified, must refer to a template that has a MachineHealthCheck + // Flavor, if specified, must refer to a template that has a MachineHealthCheck // resource configured to match the MachineDeployment managed Machines and be // configured to treat "e2e.remediation.condition" "False" as an unhealthy // condition with a short timeout. // If not specified, "md-remediation" is used. - MDFlavor *string + Flavor *string } -// MachineRemediationSpec implements a test that verifies that Machines are remediated by MHC during unhealthy conditions. -func MachineRemediationSpec(ctx context.Context, inputGetter func() MachineRemediationSpecInput) { +// MachineDeploymentRemediationSpec implements a test that verifies that Machines are remediated by MHC during unhealthy conditions. 
+func MachineDeploymentRemediationSpec(ctx context.Context, inputGetter func() MachineDeploymentRemediationSpecInput) { var ( - specName = "mhc-remediation" - input MachineRemediationSpecInput + specName = "md-remediation" + input MachineDeploymentRemediationSpecInput namespace *corev1.Namespace cancelWatches context.CancelFunc clusterResources *clusterctl.ApplyClusterTemplateAndWaitResult @@ -80,7 +73,7 @@ func MachineRemediationSpec(ctx context.Context, inputGetter func() MachineRemed clusterResources = new(clusterctl.ApplyClusterTemplateAndWaitResult) }) - It("Should successfully trigger machine deployment remediation", func() { + It("Should replace unhealthy machines", func() { By("Creating a workload cluster") clusterctl.ApplyClusterTemplateAndWait(ctx, clusterctl.ApplyClusterTemplateAndWaitInput{ @@ -90,7 +83,7 @@ func MachineRemediationSpec(ctx context.Context, inputGetter func() MachineRemed ClusterProxy: input.BootstrapClusterProxy, ConfigCluster: clusterctl.ConfigClusterInput{ LogFolder: filepath.Join(input.ArtifactFolder, "clusters", input.BootstrapClusterProxy.GetName()), ClusterctlConfigPath: input.ClusterctlConfigPath, KubeconfigPath: input.BootstrapClusterProxy.GetKubeconfigPath(), InfrastructureProvider: clusterctl.DefaultInfrastructureProvider, - Flavor: pointer.StringDeref(input.MDFlavor, "md-remediation"), + Flavor: pointer.StringDeref(input.Flavor, "md-remediation"), Namespace: namespace.Name, ClusterName: fmt.Sprintf("%s-%s", specName, util.RandomString(6)), KubernetesVersion: input.E2EConfig.GetVariable(KubernetesVersion), @@ -103,6 +96,8 @@ func MachineRemediationSpec(ctx context.Context, inputGetter func() MachineRemed WaitForMachineDeployments: input.E2EConfig.GetIntervals(specName, "wait-worker-nodes"), }, clusterResources) + // TODO: this should be re-written like the KCP remediation test, because the current implementation + // only tests that MHC applies the unhealthy condition but it doesn't test that the unhealthy machine is deleted and a replacement machine comes up. 
By("Setting a machine unhealthy and wait for MachineDeployment remediation") framework.DiscoverMachineHealthChecksAndWaitForRemediation(ctx, framework.DiscoverMachineHealthCheckAndWaitForRemediationInput{ ClusterProxy: input.BootstrapClusterProxy, @@ -113,39 +108,6 @@ func MachineRemediationSpec(ctx context.Context, inputGetter func() MachineRemed By("PASSED!") }) - It("Should successfully trigger KCP remediation", func() { - By("Creating a workload cluster") - - clusterctl.ApplyClusterTemplateAndWait(ctx, clusterctl.ApplyClusterTemplateAndWaitInput{ - ClusterProxy: input.BootstrapClusterProxy, - ConfigCluster: clusterctl.ConfigClusterInput{ - LogFolder: filepath.Join(input.ArtifactFolder, "clusters", input.BootstrapClusterProxy.GetName()), - ClusterctlConfigPath: input.ClusterctlConfigPath, - KubeconfigPath: input.BootstrapClusterProxy.GetKubeconfigPath(), - InfrastructureProvider: clusterctl.DefaultInfrastructureProvider, - Flavor: pointer.StringDeref(input.KCPFlavor, "kcp-remediation"), - Namespace: namespace.Name, - ClusterName: fmt.Sprintf("%s-%s", specName, util.RandomString(6)), - KubernetesVersion: input.E2EConfig.GetVariable(KubernetesVersion), - ControlPlaneMachineCount: pointer.Int64(3), - WorkerMachineCount: pointer.Int64(1), - }, - ControlPlaneWaiters: input.ControlPlaneWaiters, - WaitForClusterIntervals: input.E2EConfig.GetIntervals(specName, "wait-cluster"), - WaitForControlPlaneIntervals: input.E2EConfig.GetIntervals(specName, "wait-control-plane"), - WaitForMachineDeployments: input.E2EConfig.GetIntervals(specName, "wait-worker-nodes"), - }, clusterResources) - - By("Setting a machine unhealthy and wait for KubeadmControlPlane remediation") - framework.DiscoverMachineHealthChecksAndWaitForRemediation(ctx, framework.DiscoverMachineHealthCheckAndWaitForRemediationInput{ - ClusterProxy: input.BootstrapClusterProxy, - Cluster: clusterResources.Cluster, - WaitForMachineRemediation: input.E2EConfig.GetIntervals(specName, "wait-machine-remediation"), - }) 
- - By("PASSED!") - }) - AfterEach(func() { // Dumps all the resources in the spec namespace, then cleanups the cluster object and the spec namespace itself. dumpSpecResourcesAndCleanup(ctx, specName, input.BootstrapClusterProxy, input.ArtifactFolder, namespace, cancelWatches, clusterResources.Cluster, input.E2EConfig.GetIntervals, input.SkipCleanup) diff --git a/test/e2e/md_remediations_test.go b/test/e2e/md_remediations_test.go new file mode 100644 index 000000000000..c35b60938d82 --- /dev/null +++ b/test/e2e/md_remediations_test.go @@ -0,0 +1,36 @@ +//go:build e2e +// +build e2e + +/* +Copyright 2020 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package e2e + +import ( + . 
"github.com/onsi/ginkgo/v2" +) + +var _ = Describe("When testing MachineDeployment remediation", func() { + MachineDeploymentRemediationSpec(ctx, func() MachineDeploymentRemediationSpecInput { + return MachineDeploymentRemediationSpecInput{ + E2EConfig: e2eConfig, + ClusterctlConfigPath: clusterctlConfigPath, + BootstrapClusterProxy: bootstrapClusterProxy, + ArtifactFolder: artifactFolder, + SkipCleanup: skipCleanup, + } + }) +}) diff --git a/test/infrastructure/container/docker.go b/test/infrastructure/container/docker.go index 204ec75ee7a3..a790f881b782 100644 --- a/test/infrastructure/container/docker.go +++ b/test/infrastructure/container/docker.go @@ -161,8 +161,7 @@ func (d *dockerRuntime) GetHostPort(ctx context.Context, containerName, portAndP } // ExecContainer executes a command in a running container and writes any output to the provided writer. -func (d *dockerRuntime) ExecContainer(_ context.Context, containerName string, config *ExecContainerInput, command string, args ...string) error { - ctx := context.Background() // Let the command finish, even if it takes longer than the default timeout +func (d *dockerRuntime) ExecContainer(ctx context.Context, containerName string, config *ExecContainerInput, command string, args ...string) error { execConfig := types.ExecConfig{ // Run with privileges so we can remount etc.. 
// This might not make sense in the most general sense, but it is diff --git a/test/infrastructure/docker/internal/controllers/dockermachine_controller.go b/test/infrastructure/docker/internal/controllers/dockermachine_controller.go index edc90b4232ee..7920065ae40f 100644 --- a/test/infrastructure/docker/internal/controllers/dockermachine_controller.go +++ b/test/infrastructure/docker/internal/controllers/dockermachine_controller.go @@ -305,10 +305,31 @@ func (r *DockerMachineReconciler) reconcileNormal(ctx context.Context, cluster * if err := externalMachine.CheckForBootstrapSuccess(timeoutCtx, false); err != nil { bootstrapData, format, err := r.getBootstrapData(timeoutCtx, machine) if err != nil { - log.Error(err, "failed to get bootstrap data") return ctrl.Result{}, err } + // Set up a goroutine to check for the machine being deleted while running bootstrap as a + // synchronous process, e.g. due to remediation. The routine stops when timeoutCtx is Done + // (either because canceled intentionally due to machine deletion or canceled by the defer cancel() + // call when exiting from this func). + go func() { + for { + select { + case <-timeoutCtx.Done(): + return + default: + updatedDockerMachine := &infrav1.DockerMachine{} + if err := r.Client.Get(ctx, client.ObjectKeyFromObject(dockerMachine), updatedDockerMachine); err == nil && + !updatedDockerMachine.DeletionTimestamp.IsZero() { + log.Info("Cancelling Bootstrap because the underlying machine has been deleted") + cancel() + return + } + time.Sleep(5 * time.Second) + } + } + }() + // Run the bootstrap script. Simulates cloud-init/Ignition. if err := externalMachine.ExecBootstrap(timeoutCtx, bootstrapData, format); err != nil { conditions.MarkFalse(dockerMachine, infrav1.BootstrapExecSucceededCondition, infrav1.BootstrapFailedReason, clusterv1.ConditionSeverityWarning, "Repeating bootstrap")