From 63d56b25f94a3be2f4fe3b7a09ded8032c215183 Mon Sep 17 00:00:00 2001 From: Christian Schlotter Date: Mon, 26 Feb 2024 17:05:05 +0100 Subject: [PATCH] api: implement annotation to manually mark machines for remediation via MHC --- api/v1beta1/common_types.go | 3 +++ api/v1beta1/condition_consts.go | 4 ++++ .../src/reference/labels_and_annotations.md | 1 + .../machinehealthcheck_targets.go | 7 ++++++ .../machinehealthcheck_targets_test.go | 24 +++++++++++++++++-- util/annotations/helpers.go | 5 ++++ 6 files changed, 42 insertions(+), 2 deletions(-) diff --git a/api/v1beta1/common_types.go b/api/v1beta1/common_types.go index b25985f1eccb..91c12f32efb1 100644 --- a/api/v1beta1/common_types.go +++ b/api/v1beta1/common_types.go @@ -128,6 +128,9 @@ const ( // MachineSkipRemediationAnnotation is the annotation used to mark the machines that should not be considered for remediation by MachineHealthCheck reconciler. MachineSkipRemediationAnnotation = "cluster.x-k8s.io/skip-remediation" + // RemediateMachineAnnotation is the annotation used to mark machines that should be remediated by MachineHealthCheck reconciler. + RemediateMachineAnnotation = "cluster.x-k8s.io/remediate-machine" + // MachineSetSkipPreflightChecksAnnotation is the annotation used to provide a comma-separated list of // preflight checks that should be skipped during the MachineSet reconciliation. // Supported items are: diff --git a/api/v1beta1/condition_consts.go b/api/v1beta1/condition_consts.go index d2df705dc80d..57d8324c1864 100644 --- a/api/v1beta1/condition_consts.go +++ b/api/v1beta1/condition_consts.go @@ -152,6 +152,10 @@ const ( // MachineHasFailureReason is the reason used when a machine has either a FailureReason or a FailureMessage set on its status. MachineHasFailureReason = "MachineHasFailure" + // HasRemediateMachineAnnotationReason is the reason that gets set on the MachineHealthCheckSucceededCondition when a machine + // has the RemediateMachineAnnotation set. 
+ HasRemediateMachineAnnotationReason = "HasRemediateMachineAnnotation" + // NodeStartupTimeoutReason is the reason used when a machine's node does not appear within the specified timeout. NodeStartupTimeoutReason = "NodeStartupTimeout" diff --git a/docs/book/src/reference/labels_and_annotations.md b/docs/book/src/reference/labels_and_annotations.md index 3d20686f853c..3f03795f1c8e 100644 --- a/docs/book/src/reference/labels_and_annotations.md +++ b/docs/book/src/reference/labels_and_annotations.md @@ -38,6 +38,7 @@ | cluster.x-k8s.io/cloned-from-name | It is the infrastructure machine annotation that stores the name of the infrastructure template resource that was cloned for the machine. This annotation is set only during cloning a template. Older/adopted machines will not have this annotation. | | cluster.x-k8s.io/cloned-from-groupkind | It is the infrastructure machine annotation that stores the group-kind of the infrastructure template resource that was cloned for the machine. This annotation is set only during cloning a template. Older/adopted machines will not have this annotation. | | cluster.x-k8s.io/skip-remediation | It is used to mark the machines that should not be considered for remediation by MachineHealthCheck reconciler. | +| cluster.x-k8s.io/remediate-machine | It can be applied to a machine to manually mark it for remediation by MachineHealthCheck reconciler. | | cluster.x-k8s.io/managed-by | It can be applied to InfraCluster resources to signify that some external system is managing the cluster infrastructure. Provider InfraCluster controllers will ignore resources with this annotation. An external controller must fulfill the contract of the InfraCluster resource. External infrastructure providers should ensure that the annotation, once set, cannot be removed. | | cluster.x-k8s.io/replicas-managed-by | It can be applied to MachinePool resources to signify that some external system is managing infrastructure scaling for that pool. 
See [the MachinePool documentation](../developer/architecture/controllers/machine-pool.md#externally-managed-autoscaler) for more details. | | cluster.x-k8s.io/skip-machineset-preflight-checks | It can be applied on MachineDeployment and MachineSet resources to specify a comma-separated list of preflight checks that should be skipped during MachineSet reconciliation. Supported preflight checks are: All, KubeadmVersionSkew, KubernetesVersionSkew, ControlPlaneIsStable. | diff --git a/internal/controllers/machinehealthcheck/machinehealthcheck_targets.go b/internal/controllers/machinehealthcheck/machinehealthcheck_targets.go index b9ee058279f1..f7677ae4d08d 100644 --- a/internal/controllers/machinehealthcheck/machinehealthcheck_targets.go +++ b/internal/controllers/machinehealthcheck/machinehealthcheck_targets.go @@ -82,6 +82,7 @@ func (t *healthCheckTarget) nodeName() string { // Determine whether or not a given target needs remediation. // The node will need remediation if any of the following are true: +// - The Machine has the remediate machine annotation // - The Machine has failed for some reason // - The Machine did not get a node before `timeoutForMachineToHaveNode` elapses // - The Node has gone away @@ -93,6 +94,12 @@ func (t *healthCheckTarget) needsRemediation(logger logr.Logger, timeoutForMachi var nextCheckTimes []time.Duration now := time.Now() + if annotations.HasRemediateMachine(t.Machine) { + conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSucceededCondition, clusterv1.HasRemediateMachineAnnotationReason, clusterv1.ConditionSeverityWarning, "Marked for remediation via remediate-machine annotation") + logger.V(3).Info("Target is marked for remediation via remediate-machine annotation") + return true, time.Duration(0) + } + if t.Machine.Status.FailureReason != nil { conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSucceededCondition, clusterv1.MachineHasFailureReason, clusterv1.ConditionSeverityWarning, "FailureReason: %v", 
*t.Machine.Status.FailureReason) logger.V(3).Info("Target is unhealthy", "failureReason", t.Machine.Status.FailureReason) diff --git a/internal/controllers/machinehealthcheck/machinehealthcheck_targets_test.go b/internal/controllers/machinehealthcheck/machinehealthcheck_targets_test.go index abdc1bbc9563..3fdad0265d21 100644 --- a/internal/controllers/machinehealthcheck/machinehealthcheck_targets_test.go +++ b/internal/controllers/machinehealthcheck/machinehealthcheck_targets_test.go @@ -87,10 +87,10 @@ func TestGetTargetsFromMHC(t *testing.T) { // machines for skip remediation testNode5 := newTestNode("node5") testMachine5 := newTestMachine("machine5", namespace, clusterName, testNode5.Name, mhcSelector) - testMachine5.Annotations = map[string]string{"cluster.x-k8s.io/skip-remediation": ""} + testMachine5.Annotations = map[string]string{clusterv1.MachineSkipRemediationAnnotation: ""} testNode6 := newTestNode("node6") testMachine6 := newTestMachine("machine6", namespace, clusterName, testNode6.Name, mhcSelector) - testMachine6.Annotations = map[string]string{"cluster.x-k8s.io/paused": ""} + testMachine6.Annotations = map[string]string{clusterv1.PausedAnnotation: ""} testCases := []struct { desc string @@ -340,6 +340,18 @@ func TestHealthCheckTargets(t *testing.T) { } machineFailureMsgCondition := newFailedHealthCheckCondition(clusterv1.MachineHasFailureReason, "FailureMessage: %s", failureMsg) + // Target for when the machine has the remediate machine annotation + annotationRemediationMsg := "Marked for remediation via remediate-machine annotation" + testMachineAnnotationRemediation := testMachine.DeepCopy() + testMachineAnnotationRemediation.Annotations = map[string]string{clusterv1.RemediateMachineAnnotation: ""} + machineAnnotationRemediation := healthCheckTarget{ + Cluster: cluster, + MHC: testMHC, + Machine: testMachineAnnotationRemediation, + Node: nil, + } + machineAnnotationRemediationCondition := 
newFailedHealthCheckCondition(clusterv1.HasRemediateMachineAnnotationReason, annotationRemediationMsg) + testCases := []struct { desc string targets []healthCheckTarget @@ -426,6 +438,14 @@ func TestHealthCheckTargets(t *testing.T) { expectedNeedsRemediationCondition: []clusterv1.Condition{machineFailureMsgCondition}, expectedNextCheckTimes: []time.Duration{}, }, + { + desc: "when the machine is manually marked for remediation", + targets: []healthCheckTarget{machineAnnotationRemediation}, + expectedHealthy: []healthCheckTarget{}, + expectedNeedsRemediation: []healthCheckTarget{machineAnnotationRemediation}, + expectedNeedsRemediationCondition: []clusterv1.Condition{machineAnnotationRemediationCondition}, + expectedNextCheckTimes: []time.Duration{}, + }, } for _, tc := range testCases { diff --git a/util/annotations/helpers.go b/util/annotations/helpers.go index 47dc7fc6b77b..e4990032b459 100644 --- a/util/annotations/helpers.go +++ b/util/annotations/helpers.go @@ -48,6 +48,11 @@ func HasSkipRemediation(o metav1.Object) bool { return hasAnnotation(o, clusterv1.MachineSkipRemediationAnnotation) } +// HasRemediateMachine returns true if the object has the `remediate-machine` annotation. +func HasRemediateMachine(o metav1.Object) bool { + return hasAnnotation(o, clusterv1.RemediateMachineAnnotation) +} + // HasWithPrefix returns true if at least one of the annotations has the prefix specified. func HasWithPrefix(prefix string, annotations map[string]string) bool { for key := range annotations {