diff --git a/api/v1alpha4/common_types.go b/api/v1alpha4/common_types.go index 2062ff8e734f..576c1d312222 100644 --- a/api/v1alpha4/common_types.go +++ b/api/v1alpha4/common_types.go @@ -71,6 +71,9 @@ const ( // that was cloned for the machine. This annotation is set only during cloning a template. Older/adopted machines will not have this annotation. TemplateClonedFromGroupKindAnnotation = "cluster.x-k8s.io/cloned-from-groupkind" + // MachineSkipRemediationAnnotation is the annotation used to mark the machines that should not be considered for remediation by MachineHealthCheck reconciler. + MachineSkipRemediationAnnotation = "cluster.x-k8s.io/skip-remediation" + // ClusterSecretType defines the type of secret created by core components ClusterSecretType corev1.SecretType = "cluster.x-k8s.io/secret" //nolint:gosec diff --git a/controllers/machinehealthcheck_controller.go b/controllers/machinehealthcheck_controller.go index 935bd8a28e75..c5e145831ee9 100644 --- a/controllers/machinehealthcheck_controller.go +++ b/controllers/machinehealthcheck_controller.go @@ -194,7 +194,7 @@ func (r *MachineHealthCheckReconciler) reconcile(ctx context.Context, logger log // fetch all targets logger.V(3).Info("Finding targets") - targets, err := r.getTargetsFromMHC(ctx, remoteClient, m) + targets, err := r.getTargetsFromMHC(ctx, logger, remoteClient, m) if err != nil { logger.Error(err, "Failed to fetch targets from MachineHealthCheck") return ctrl.Result{}, err diff --git a/controllers/machinehealthcheck_targets.go b/controllers/machinehealthcheck_targets.go index 3de56dbed53b..5493b54ef2e7 100644 --- a/controllers/machinehealthcheck_targets.go +++ b/controllers/machinehealthcheck_targets.go @@ -28,6 +28,7 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" clusterv1 "sigs.k8s.io/cluster-api/api/v1alpha4" + "sigs.k8s.io/cluster-api/util/annotations" "sigs.k8s.io/cluster-api/util/conditions" "sigs.k8s.io/cluster-api/util/patch" "sigs.k8s.io/controller-runtime/pkg/client" @@ -159,7 +160,7 @@ func (t *healthCheckTarget) needsRemediation(logger logr.Logger, timeoutForMachi // getTargetsFromMHC uses the MachineHealthCheck's selector to fetch machines // and their nodes targeted by the health check, ready for health checking. -func (r *MachineHealthCheckReconciler) getTargetsFromMHC(ctx context.Context, clusterClient client.Reader, mhc *clusterv1.MachineHealthCheck) ([]healthCheckTarget, error) { +func (r *MachineHealthCheckReconciler) getTargetsFromMHC(ctx context.Context, logger logr.Logger, clusterClient client.Reader, mhc *clusterv1.MachineHealthCheck) ([]healthCheckTarget, error) { machines, err := r.getMachinesFromMHC(ctx, mhc) if err != nil { return nil, errors.Wrap(err, "error getting machines from MachineHealthCheck") @@ -170,6 +171,12 @@ func (r *MachineHealthCheckReconciler) getTargetsFromMHC(ctx context.Context, cl targets := []healthCheckTarget{} for k := range machines { + skip, reason := shouldSkipRemediation(&machines[k]) + if skip { + logger.Info("skipping remediation", "machine", machines[k].Name, "reason", reason) + continue + } + patchHelper, err := patch.NewHelper(&machines[k], r.Client) if err != nil { return nil, errors.Wrap(err, "unable to initialize patch helper") @@ -298,3 +305,17 @@ func minDuration(durations []time.Duration) time.Duration { } return minDuration } + +// shouldSkipRemediation checks if the machine should be skipped for remediation. +// Returns true if it should be skipped along with the reason for skipping. +func shouldSkipRemediation(m *clusterv1.Machine) (bool, string) { + if annotations.HasPausedAnnotation(m) { + return true, fmt.Sprintf("machine has %q annotation", clusterv1.PausedAnnotation) + } + + if annotations.HasSkipRemediationAnnotation(m) { + return true, fmt.Sprintf("machine has %q annotation", clusterv1.MachineSkipRemediationAnnotation) + } + + return false, "" +} diff --git a/controllers/machinehealthcheck_targets_test.go b/controllers/machinehealthcheck_targets_test.go index caea281d0da6..a0a8cb9d0b33 100644 --- a/controllers/machinehealthcheck_targets_test.go +++ b/controllers/machinehealthcheck_targets_test.go @@ -75,6 +75,14 @@ func TestGetTargetsFromMHC(t *testing.T) { testNode4 := newTestNode("node4") testMachine4 := newTestMachine("machine4", namespace, "other-cluster", testNode4.Name, mhcSelector) + // machines for skip remediation + testNode5 := newTestNode("node5") + testMachine5 := newTestMachine("machine5", namespace, clusterName, testNode5.Name, mhcSelector) + testMachine5.Annotations = map[string]string{"cluster.x-k8s.io/skip-remediation": ""} + testNode6 := newTestNode("node6") + testMachine6 := newTestMachine("machine6", namespace, clusterName, testNode6.Name, mhcSelector) + testMachine6.Annotations = map[string]string{"cluster.x-k8s.io/paused": ""} + testCases := []struct { desc string toCreate []client.Object @@ -124,6 +132,17 @@ func TestGetTargetsFromMHC(t *testing.T) { }, }, }, + { + desc: "with machines having skip-remediation or paused annotation", + toCreate: append(baseObjects, testNode1, testMachine1, testMachine5, testMachine6), + expectedTargets: []healthCheckTarget{ + { + Machine: testMachine1, + MHC: testMHC, + Node: testNode1, + }, + }, + }, } for _, tc := range testCases { @@ -143,7 +162,7 @@ func TestGetTargetsFromMHC(t *testing.T) { t.patchHelper = patchHelper } - targets, err := reconciler.getTargetsFromMHC(ctx, k8sClient, testMHC) + targets, err := reconciler.getTargetsFromMHC(ctx, ctrl.LoggerFrom(ctx), k8sClient, testMHC) gs.Expect(err).ToNot(HaveOccurred()) gs.Expect(len(targets)).To(Equal(len(tc.expectedTargets))) diff --git a/docs/book/src/tasks/healthcheck.md b/docs/book/src/tasks/healthcheck.md index af86c175fbd8..50c10647f6be 100644 --- a/docs/book/src/tasks/healthcheck.md +++ b/docs/book/src/tasks/healthcheck.md @@ -100,6 +100,18 @@ If `maxUnhealthy` is set to `40%` and there are 6 Machines being checked: Note, when the percentage is not a whole number, the allowed number is rounded down. +## Skipping Remediation + +There are scenarios where remediation for a machine may be undesirable (eg. during cluster migration using `clustrctl move`). For such cases, MachineHealthCheck provides 2 mechanisms to skip machines for remediation. + +Implicit skipping when the resource is paused (using `cluster.x-k8s.io/paused` annotation): +- When a cluster is paused, none of the machines in that cluster are considered for remediation. +- When a machine is paused, only that machine is not considered for remediation. +- A cluster or a machine is usually paused automatically by cluster api when it detects a migration. + +Explicit skipping using `cluster.x-k8s.io/skip-remediation` annotation: +- Users can also skip any machine for remediation by setting the `cluster.x-k8s.io/skip-remediation` for that machine. + ## Limitations and Caveats of a MachineHealthCheck Before deploying a MachineHealthCheck, please familiarise yourself with the following limitations and caveats: diff --git a/util/annotations/helpers.go b/util/annotations/helpers.go index 7674c3fcd38d..71b96b3309b8 100644 --- a/util/annotations/helpers.go +++ b/util/annotations/helpers.go @@ -33,12 +33,12 @@ func IsPaused(cluster *clusterv1.Cluster, o metav1.Object) bool { // HasPausedAnnotation returns true if the object has the `paused` annotation. func HasPausedAnnotation(o metav1.Object) bool { - annotations := o.GetAnnotations() - if annotations == nil { - return false - } - _, ok := annotations[clusterv1.PausedAnnotation] - return ok + return hasAnnotation(o, clusterv1.PausedAnnotation) +} + +// HasSkipRemediationAnnotation returns true if the object has the `skip-remediation` annotation. +func HasSkipRemediationAnnotation(o metav1.Object) bool { + return hasAnnotation(o, clusterv1.MachineSkipRemediationAnnotation) } func HasWithPrefix(prefix string, annotations map[string]string) bool { @@ -65,3 +65,13 @@ func AddAnnotations(o metav1.Object, desired map[string]string) bool { } return hasChanged } + +// hasAnnotation returns true if the object has the specified annotation +func hasAnnotation(o metav1.Object, annotation string) bool { + annotations := o.GetAnnotations() + if annotations == nil { + return false + } + _, ok := annotations[annotation] + return ok +}