Skip to content

Commit

Permalink
skip remediation for paused machines and those marked for skipping
Browse files Browse the repository at this point in the history
  • Loading branch information
shysank committed Feb 10, 2021
1 parent 4a6cccd commit dbf553b
Show file tree
Hide file tree
Showing 6 changed files with 74 additions and 9 deletions.
3 changes: 3 additions & 0 deletions api/v1alpha4/common_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,9 @@ const (
// that was cloned for the machine. This annotation is set only during cloning a template. Older/adopted machines will not have this annotation.
TemplateClonedFromGroupKindAnnotation = "cluster.x-k8s.io/cloned-from-groupkind"

// MachineSkipRemediationAnnotation is the annotation used to mark the machines that should not be considered for remediation by MachineHealthCheck reconciler.
MachineSkipRemediationAnnotation = "cluster.x-k8s.io/skip-remediation"

// ClusterSecretType defines the type of secret created by core components
ClusterSecretType corev1.SecretType = "cluster.x-k8s.io/secret" //nolint:gosec

Expand Down
2 changes: 1 addition & 1 deletion controllers/machinehealthcheck_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -194,7 +194,7 @@ func (r *MachineHealthCheckReconciler) reconcile(ctx context.Context, logger log

// fetch all targets
logger.V(3).Info("Finding targets")
targets, err := r.getTargetsFromMHC(ctx, remoteClient, m)
targets, err := r.getTargetsFromMHC(ctx, logger, remoteClient, m)
if err != nil {
logger.Error(err, "Failed to fetch targets from MachineHealthCheck")
return ctrl.Result{}, err
Expand Down
23 changes: 22 additions & 1 deletion controllers/machinehealthcheck_targets.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
clusterv1 "sigs.k8s.io/cluster-api/api/v1alpha4"
"sigs.k8s.io/cluster-api/util/annotations"
"sigs.k8s.io/cluster-api/util/conditions"
"sigs.k8s.io/cluster-api/util/patch"
"sigs.k8s.io/controller-runtime/pkg/client"
Expand Down Expand Up @@ -159,7 +160,7 @@ func (t *healthCheckTarget) needsRemediation(logger logr.Logger, timeoutForMachi

// getTargetsFromMHC uses the MachineHealthCheck's selector to fetch machines
// and their nodes targeted by the health check, ready for health checking.
func (r *MachineHealthCheckReconciler) getTargetsFromMHC(ctx context.Context, clusterClient client.Reader, mhc *clusterv1.MachineHealthCheck) ([]healthCheckTarget, error) {
func (r *MachineHealthCheckReconciler) getTargetsFromMHC(ctx context.Context, logger logr.Logger, clusterClient client.Reader, mhc *clusterv1.MachineHealthCheck) ([]healthCheckTarget, error) {
machines, err := r.getMachinesFromMHC(ctx, mhc)
if err != nil {
return nil, errors.Wrap(err, "error getting machines from MachineHealthCheck")
Expand All @@ -170,6 +171,12 @@ func (r *MachineHealthCheckReconciler) getTargetsFromMHC(ctx context.Context, cl

targets := []healthCheckTarget{}
for k := range machines {
skip, reason := shouldSkipRemediation(&machines[k])
if skip {
logger.Info("skipping remediation", "machine", machines[k].Name, "reason", reason)
continue
}

patchHelper, err := patch.NewHelper(&machines[k], r.Client)
if err != nil {
return nil, errors.Wrap(err, "unable to initialize patch helper")
Expand Down Expand Up @@ -298,3 +305,17 @@ func minDuration(durations []time.Duration) time.Duration {
}
return minDuration
}

// shouldSkipRemediation checks if the machine should be skipped for remediation.
// Returns true if it should be skipped along with the reason for skipping.
func shouldSkipRemediation(m *clusterv1.Machine) (bool, string) {
if annotations.HasPausedAnnotation(m) {
return true, fmt.Sprintf("machine has %q annotation", clusterv1.PausedAnnotation)
}

if annotations.HasSkipRemediationAnnotation(m) {
return true, fmt.Sprintf("machine has %q annotation", clusterv1.MachineSkipRemediationAnnotation)
}

return false, ""
}
21 changes: 20 additions & 1 deletion controllers/machinehealthcheck_targets_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,14 @@ func TestGetTargetsFromMHC(t *testing.T) {
testNode4 := newTestNode("node4")
testMachine4 := newTestMachine("machine4", namespace, "other-cluster", testNode4.Name, mhcSelector)

// machines for skip remediation
testNode5 := newTestNode("node5")
testMachine5 := newTestMachine("machine5", namespace, clusterName, testNode5.Name, mhcSelector)
testMachine5.Annotations = map[string]string{"cluster.x-k8s.io/skip-remediation": ""}
testNode6 := newTestNode("node6")
testMachine6 := newTestMachine("machine6", namespace, clusterName, testNode6.Name, mhcSelector)
testMachine6.Annotations = map[string]string{"cluster.x-k8s.io/paused": ""}

testCases := []struct {
desc string
toCreate []client.Object
Expand Down Expand Up @@ -124,6 +132,17 @@ func TestGetTargetsFromMHC(t *testing.T) {
},
},
},
{
desc: "with machines having skip-remediation or paused annotation",
toCreate: append(baseObjects, testNode1, testMachine1, testMachine5, testMachine6),
expectedTargets: []healthCheckTarget{
{
Machine: testMachine1,
MHC: testMHC,
Node: testNode1,
},
},
},
}

for _, tc := range testCases {
Expand All @@ -143,7 +162,7 @@ func TestGetTargetsFromMHC(t *testing.T) {
t.patchHelper = patchHelper
}

targets, err := reconciler.getTargetsFromMHC(ctx, k8sClient, testMHC)
targets, err := reconciler.getTargetsFromMHC(ctx, ctrl.LoggerFrom(ctx), k8sClient, testMHC)
gs.Expect(err).ToNot(HaveOccurred())

gs.Expect(len(targets)).To(Equal(len(tc.expectedTargets)))
Expand Down
12 changes: 12 additions & 0 deletions docs/book/src/tasks/healthcheck.md
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,18 @@ If `maxUnhealthy` is set to `40%` and there are 6 Machines being checked:

Note, when the percentage is not a whole number, the allowed number is rounded down.

## Skipping Remediation

There are scenarios where remediation for a machine may be undesirable (eg. during cluster migration using `clustrctl move`). For such cases, MachineHealthCheck provides 2 mechanisms to skip machines for remediation.

Implicit skipping when the resource is paused (using `cluster.x-k8s.io/paused` annotation):
- When a cluster is paused, none of the machines in that cluster are considered for remediation.
- When a machine is paused, only that machine is not considered for remediation.
- A cluster or a machine is usually paused automatically by cluster api when it detects a migration.

Explicit skipping using `cluster.x-k8s.io/skip-remediation` annotation:
- Users can also skip any machine for remediation by setting the `cluster.x-k8s.io/skip-remediation` for that machine.

## Limitations and Caveats of a MachineHealthCheck

Before deploying a MachineHealthCheck, please familiarise yourself with the following limitations and caveats:
Expand Down
22 changes: 16 additions & 6 deletions util/annotations/helpers.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,12 +33,12 @@ func IsPaused(cluster *clusterv1.Cluster, o metav1.Object) bool {

// HasPausedAnnotation returns true if the object has the `paused` annotation.
func HasPausedAnnotation(o metav1.Object) bool {
annotations := o.GetAnnotations()
if annotations == nil {
return false
}
_, ok := annotations[clusterv1.PausedAnnotation]
return ok
return hasAnnotation(o, clusterv1.PausedAnnotation)
}

// HasSkipRemediationAnnotation returns true if the object has the `skip-remediation` annotation.
func HasSkipRemediationAnnotation(o metav1.Object) bool {
return hasAnnotation(o, clusterv1.MachineSkipRemediationAnnotation)
}

func HasWithPrefix(prefix string, annotations map[string]string) bool {
Expand All @@ -65,3 +65,13 @@ func AddAnnotations(o metav1.Object, desired map[string]string) bool {
}
return hasChanged
}

// hasAnnotation returns true if the object has the specified annotation
func hasAnnotation(o metav1.Object, annotation string) bool {
annotations := o.GetAnnotations()
if annotations == nil {
return false
}
_, ok := annotations[annotation]
return ok
}

0 comments on commit dbf553b

Please sign in to comment.