Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

✨ Add support to skip Machine remediation, and respect paused Machines in MHC #4168

Merged
merged 1 commit into from
Feb 18, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions api/v1alpha4/common_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,9 @@ const (
// that was cloned for the machine. This annotation is set only during cloning a template. Older/adopted machines will not have this annotation.
TemplateClonedFromGroupKindAnnotation = "cluster.x-k8s.io/cloned-from-groupkind"

// MachineSkipRemediationAnnotation is the annotation used to mark the machines that should not be considered for remediation by MachineHealthCheck reconciler.
MachineSkipRemediationAnnotation = "cluster.x-k8s.io/skip-remediation"

// ClusterSecretType defines the type of secret created by core components
ClusterSecretType corev1.SecretType = "cluster.x-k8s.io/secret" //nolint:gosec

Expand Down
2 changes: 1 addition & 1 deletion controllers/machinehealthcheck_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -194,7 +194,7 @@ func (r *MachineHealthCheckReconciler) reconcile(ctx context.Context, logger log

// fetch all targets
logger.V(3).Info("Finding targets")
targets, err := r.getTargetsFromMHC(ctx, remoteClient, m)
targets, err := r.getTargetsFromMHC(ctx, logger, remoteClient, m)
if err != nil {
logger.Error(err, "Failed to fetch targets from MachineHealthCheck")
return ctrl.Result{}, err
Expand Down
23 changes: 22 additions & 1 deletion controllers/machinehealthcheck_targets.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
clusterv1 "sigs.k8s.io/cluster-api/api/v1alpha4"
"sigs.k8s.io/cluster-api/util/annotations"
"sigs.k8s.io/cluster-api/util/conditions"
"sigs.k8s.io/cluster-api/util/patch"
"sigs.k8s.io/controller-runtime/pkg/client"
Expand Down Expand Up @@ -159,7 +160,7 @@ func (t *healthCheckTarget) needsRemediation(logger logr.Logger, timeoutForMachi

// getTargetsFromMHC uses the MachineHealthCheck's selector to fetch machines
// and their nodes targeted by the health check, ready for health checking.
func (r *MachineHealthCheckReconciler) getTargetsFromMHC(ctx context.Context, clusterClient client.Reader, mhc *clusterv1.MachineHealthCheck) ([]healthCheckTarget, error) {
func (r *MachineHealthCheckReconciler) getTargetsFromMHC(ctx context.Context, logger logr.Logger, clusterClient client.Reader, mhc *clusterv1.MachineHealthCheck) ([]healthCheckTarget, error) {
machines, err := r.getMachinesFromMHC(ctx, mhc)
if err != nil {
return nil, errors.Wrap(err, "error getting machines from MachineHealthCheck")
Expand All @@ -170,6 +171,12 @@ func (r *MachineHealthCheckReconciler) getTargetsFromMHC(ctx context.Context, cl

targets := []healthCheckTarget{}
for k := range machines {
skip, reason := shouldSkipRemediation(&machines[k])
if skip {
logger.Info("skipping remediation", "machine", machines[k].Name, "reason", reason)
continue
}

patchHelper, err := patch.NewHelper(&machines[k], r.Client)
if err != nil {
return nil, errors.Wrap(err, "unable to initialize patch helper")
Expand Down Expand Up @@ -298,3 +305,17 @@ func minDuration(durations []time.Duration) time.Duration {
}
return minDuration
}

// shouldSkipRemediation checks if the machine should be skipped for remediation.
// Returns true if it should be skipped along with the reason for skipping.
func shouldSkipRemediation(m *clusterv1.Machine) (bool, string) {
if annotations.HasPausedAnnotation(m) {
return true, fmt.Sprintf("machine has %q annotation", clusterv1.PausedAnnotation)
}

if annotations.HasSkipRemediationAnnotation(m) {
return true, fmt.Sprintf("machine has %q annotation", clusterv1.MachineSkipRemediationAnnotation)
}

return false, ""
}
21 changes: 20 additions & 1 deletion controllers/machinehealthcheck_targets_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,14 @@ func TestGetTargetsFromMHC(t *testing.T) {
testNode4 := newTestNode("node4")
testMachine4 := newTestMachine("machine4", namespace, "other-cluster", testNode4.Name, mhcSelector)

// machines for skip remediation
testNode5 := newTestNode("node5")
testMachine5 := newTestMachine("machine5", namespace, clusterName, testNode5.Name, mhcSelector)
testMachine5.Annotations = map[string]string{"cluster.x-k8s.io/skip-remediation": ""}
testNode6 := newTestNode("node6")
testMachine6 := newTestMachine("machine6", namespace, clusterName, testNode6.Name, mhcSelector)
testMachine6.Annotations = map[string]string{"cluster.x-k8s.io/paused": ""}

testCases := []struct {
desc string
toCreate []client.Object
Expand Down Expand Up @@ -124,6 +132,17 @@ func TestGetTargetsFromMHC(t *testing.T) {
},
},
},
{
desc: "with machines having skip-remediation or paused annotation",
toCreate: append(baseObjects, testNode1, testMachine1, testMachine5, testMachine6),
expectedTargets: []healthCheckTarget{
{
Machine: testMachine1,
MHC: testMHC,
Node: testNode1,
},
},
},
}

for _, tc := range testCases {
Expand All @@ -143,7 +162,7 @@ func TestGetTargetsFromMHC(t *testing.T) {
t.patchHelper = patchHelper
}

targets, err := reconciler.getTargetsFromMHC(ctx, k8sClient, testMHC)
targets, err := reconciler.getTargetsFromMHC(ctx, ctrl.LoggerFrom(ctx), k8sClient, testMHC)
gs.Expect(err).ToNot(HaveOccurred())

gs.Expect(len(targets)).To(Equal(len(tc.expectedTargets)))
Expand Down
12 changes: 12 additions & 0 deletions docs/book/src/tasks/healthcheck.md
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,18 @@ If `maxUnhealthy` is set to `40%` and there are 6 Machines being checked:

Note, when the percentage is not a whole number, the allowed number is rounded down.

## Skipping Remediation

There are scenarios where remediation for a machine may be undesirable (eg. during cluster migration using `clustrctl move`). For such cases, MachineHealthCheck provides 2 mechanisms to skip machines for remediation.

Implicit skipping when the resource is paused (using `cluster.x-k8s.io/paused` annotation):
- When a cluster is paused, none of the machines in that cluster are considered for remediation.
- When a machine is paused, only that machine is not considered for remediation.
- A cluster or a machine is usually paused automatically by cluster api when it detects a migration.

Explicit skipping using `cluster.x-k8s.io/skip-remediation` annotation:
- Users can also skip any machine for remediation by setting the `cluster.x-k8s.io/skip-remediation` for that machine.

## Limitations and Caveats of a MachineHealthCheck

Before deploying a MachineHealthCheck, please familiarise yourself with the following limitations and caveats:
Expand Down
22 changes: 16 additions & 6 deletions util/annotations/helpers.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,12 +33,12 @@ func IsPaused(cluster *clusterv1.Cluster, o metav1.Object) bool {

// HasPausedAnnotation returns true if the object has the `paused` annotation.
func HasPausedAnnotation(o metav1.Object) bool {
annotations := o.GetAnnotations()
if annotations == nil {
return false
}
_, ok := annotations[clusterv1.PausedAnnotation]
return ok
return hasAnnotation(o, clusterv1.PausedAnnotation)
}

// HasSkipRemediationAnnotation returns true if the object has the `skip-remediation` annotation.
func HasSkipRemediationAnnotation(o metav1.Object) bool {
return hasAnnotation(o, clusterv1.MachineSkipRemediationAnnotation)
}

func HasWithPrefix(prefix string, annotations map[string]string) bool {
Expand All @@ -65,3 +65,13 @@ func AddAnnotations(o metav1.Object, desired map[string]string) bool {
}
return hasChanged
}

// hasAnnotation returns true if the object has the specified annotation
func hasAnnotation(o metav1.Object, annotation string) bool {
annotations := o.GetAnnotations()
if annotations == nil {
return false
}
_, ok := annotations[annotation]
return ok
}