Commit: misc changes

sbueringer committed Feb 8, 2023
1 parent 3e92331 commit 8bb56d0
Showing 3 changed files with 42 additions and 41 deletions.
62 changes: 31 additions & 31 deletions controlplane/kubeadm/internal/controllers/remediation.go
@@ -73,11 +73,6 @@ func (r *KubeadmControlPlaneReconciler) reconcileUnhealthyMachines(ctx context.C
 		return ctrl.Result{}, kerrors.NewAggregate(errList)
 	}
 
-	// Returns if another remediation is in progress but the new machine is not yet created.
-	if _, ok := controlPlane.KCP.Annotations[controlplanev1.RemediationInProgressAnnotation]; ok {
-		return ctrl.Result{}, nil
-	}
-
 	// Gets all machines that have `MachineHealthCheckSucceeded=False` (indicating a problem was detected on the machine)
 	// and `MachineOwnerRemediated` present, indicating that this controller is responsible for performing remediation.
 	unhealthyMachines := controlPlane.UnhealthyMachines()
@@ -99,6 +94,14 @@ func (r *KubeadmControlPlaneReconciler) reconcileUnhealthyMachines(ctx context.C
 		return ctrl.Result{}, nil
 	}
 
+	log = log.WithValues("Machine", klog.KObj(machineToBeRemediated), "initialized", controlPlane.KCP.Status.Initialized)
+
+	// Returns if another remediation is in progress but the new machine is not yet created.
+	if _, ok := controlPlane.KCP.Annotations[controlplanev1.RemediationInProgressAnnotation]; ok {
+		log.Info("Another remediation is already in progress. Skipping remediation.")
+		return ctrl.Result{}, nil
+	}
+
 	patchHelper, err := patch.NewHelper(machineToBeRemediated, r.Client)
 	if err != nil {
 		return ctrl.Result{}, err
@@ -118,7 +121,6 @@ func (r *KubeadmControlPlaneReconciler) reconcileUnhealthyMachines(ctx context.C
 
 	// Before starting remediation, run preflight checks in order to verify it is safe to remediate.
 	// If any of the following checks fails, we'll surface the reason in the MachineOwnerRemediated condition.
-	log = log.WithValues("Machine", klog.KObj(machineToBeRemediated), "initialized", controlPlane.KCP.Status.Initialized)
 
 	// Check if KCP is allowed to remediate considering retry limits:
 	// - Remediation cannot happen because retryPeriod is not yet expired.
@@ -263,49 +265,39 @@ func (r *KubeadmControlPlaneReconciler) checkRetryLimits(log logr.Logger, machin
 		return remediationInProgressData, true, nil
 	}
 
-	// Gets MinHealthySeconds and RetryDelaySeconds from the remediation strategy, or use defaults.
+	// Gets MinHealthyPeriod and RetryPeriod from the remediation strategy, or use defaults.
 	minHealthyPeriod := controlplanev1.DefaultMinHealthyPeriod
 	if controlPlane.KCP.Spec.RemediationStrategy != nil && controlPlane.KCP.Spec.RemediationStrategy.MinHealthyPeriod != nil {
 		minHealthyPeriod = controlPlane.KCP.Spec.RemediationStrategy.MinHealthyPeriod.Duration
 	}
 
 	retryPeriod := time.Duration(0)
 	if controlPlane.KCP.Spec.RemediationStrategy != nil {
 		retryPeriod = controlPlane.KCP.Spec.RemediationStrategy.RetryPeriod.Duration
 	}
 
 	// Gets the timestamp of the last remediation; if missing, default to a value
-	// that ensures both MinHealthySeconds and RetryDelaySeconds are expired.
+	// that ensures both MinHealthyPeriod and RetryPeriod are expired.
 	// NOTE: this could potentially lead to executing more retries than expected or to executing retries earlier than
 	// expected, but this is considered acceptable when the system recovers after someone/something changes or deletes
 	// the RemediationForAnnotation on Machines.
-	max := func(x, y time.Duration) time.Duration {
-		if x < y {
-			return y
-		}
-		return x
-	}
-
 	lastRemediationTime := reconciliationTime.Add(-2 * max(minHealthyPeriod, retryPeriod))
 	if !lastRemediationData.Timestamp.IsZero() {
 		lastRemediationTime = lastRemediationData.Timestamp.Time
 	}
 
-	// Check if the machine being remediated has been created as a remediation for a previous unhealthy machine.
-	// NOTE: if someone/something changes or deletes the RemediationForAnnotation on Machines, this could potentially
-	// lead to executing more retries than expected, but this is considered acceptable in such a case.
-	machineRemediationFor := remediationInProgressData.Machine
-	if lastRemediationData.Machine != "" {
-		// If the remediation is happening before minHealthyPeriod is expired, then KCP considers this
-		// as a remediation for the same previously unhealthy machine.
-		if lastRemediationTime.Add(minHealthyPeriod).After(reconciliationTime) {
-			machineRemediationFor = lastRemediationData.Machine
-			log = log.WithValues("RemediationRetryFor", klog.KRef(machineToBeRemediated.Namespace, machineRemediationFor))
-		}
-	}
+	// Once we get here we already know that there was a last remediation for the Machine.
+	// If the current remediation is happening before minHealthyPeriod is expired, then KCP considers this
+	// as a remediation for the same previously unhealthy machine.
+	// NOTE: If someone/something changes the RemediationForAnnotation on Machines (e.g. changes the Timestamp),
+	// this could potentially lead to executing more retries than expected, but this is considered acceptable in such a case.
+	var retryForSameMachineInProgress bool
+	if lastRemediationTime.Add(minHealthyPeriod).After(reconciliationTime) {
+		retryForSameMachineInProgress = true
+		log = log.WithValues("RemediationRetryFor", klog.KRef(machineToBeRemediated.Namespace, lastRemediationData.Machine))
+	}
 
-	// If remediation is happening for a different machine, this is the first try of a new retry sequence.
-	if lastRemediationData.Machine != machineRemediationFor {
+	// If the retry for the same machine is not in progress, this is the first try of a new retry sequence.
+	if !retryForSameMachineInProgress {
 		return remediationInProgressData, true, nil
 	}
 
@@ -315,7 +307,7 @@ func (r *KubeadmControlPlaneReconciler) checkRetryLimits(log logr.Logger, machin
 	// Check if remediation can happen because retryPeriod is passed.
 	if lastRemediationTime.Add(retryPeriod).After(reconciliationTime) {
 		log.Info(fmt.Sprintf("A control plane machine needs remediation, but the operation already failed in the latest %s. Skipping remediation", retryPeriod))
-		conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate this machine because the operation already failed in the latest %s (RetryDelay)", retryPeriod)
+		conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate this machine because the operation already failed in the latest %s (RetryPeriod)", retryPeriod)
 		return remediationInProgressData, false, nil
 	}
 
@@ -329,12 +321,20 @@ func (r *KubeadmControlPlaneReconciler) checkRetryLimits(log logr.Logger, machin
 		}
 	}
 
-	// All the check passed, increase the remediation number.
+	// All the checks passed, increase the remediation retry count.
 	remediationInProgressData.RetryCount++
 
 	return remediationInProgressData, true, nil
 }
 
+// max calculates the maximum duration.
+func max(x, y time.Duration) time.Duration {
+	if x < y {
+		return y
+	}
+	return x
+}
+
 // canSafelyRemoveEtcdMember assesses if it is possible to remove the member hosted on the machine to be remediated
 // without losing etcd quorum.
 //
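
For reference, here is a minimal standalone sketch of the timing rules that checkRetryLimits encodes above. It is not part of the KCP codebase, and the helper names isRetryForSameMachine and canRetryNow are illustrative only: a remediation that happens before MinHealthyPeriod has expired since the last remediation is treated as a retry for the same machine, and a retry is blocked while RetryPeriod has not yet passed.

package main

import (
	"fmt"
	"time"
)

// isRetryForSameMachine reports whether a remediation happening at "now" counts
// as a retry for the same machine: true when the previous remediation happened
// less than minHealthyPeriod ago.
func isRetryForSameMachine(lastRemediation, now time.Time, minHealthyPeriod time.Duration) bool {
	return lastRemediation.Add(minHealthyPeriod).After(now)
}

// canRetryNow reports whether a retry is allowed at "now": false while
// retryPeriod has not yet expired since the last remediation.
func canRetryNow(lastRemediation, now time.Time, retryPeriod time.Duration) bool {
	return !lastRemediation.Add(retryPeriod).After(now)
}

func main() {
	now := time.Now()
	last := now.Add(-30 * time.Minute)

	// With a 1h MinHealthyPeriod, a remediation 30m after the previous one is a
	// retry for the same machine; with a 45m RetryPeriod it still has to wait.
	fmt.Println(isRetryForSameMachine(last, now, time.Hour)) // true
	fmt.Println(canRetryNow(last, now, 45*time.Minute))      // false
}
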
14 changes: 8 additions & 6 deletions controlplane/kubeadm/internal/controllers/remediation_test.go
@@ -106,7 +106,7 @@ func TestReconcileUnhealthyMachines(t *testing.T) {
 	t.Run("reconcileUnhealthyMachines return early if another remediation is in progress", func(t *testing.T) {
 		g := NewWithT(t)
 
-		m := getDeletingMachine(ns.Name, "m1-unhealthy-deleting-", withMachineHealthCheckFailed())
+		m := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withStuckRemediation())
 		conditions.MarkFalse(m, clusterv1.MachineHealthCheckSucceededCondition, clusterv1.MachineHasFailureReason, clusterv1.ConditionSeverityWarning, "")
 		conditions.MarkFalse(m, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "")
 		controlPlane := &internal.ControlPlane{
@@ -142,6 +142,8 @@ func TestReconcileUnhealthyMachines(t *testing.T) {
 		}
 		ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane)
 
+		g.Expect(controlPlane.KCP.Annotations).ToNot(HaveKey(controlplanev1.RemediationInProgressAnnotation))
+
 		g.Expect(ret.IsZero()).To(BeTrue()) // Remediation skipped
 		g.Expect(err).ToNot(HaveOccurred())
 	})
@@ -309,7 +311,7 @@ func TestReconcileUnhealthyMachines(t *testing.T) {
 		removeFinalizer(g, m1)
 		g.Expect(env.Cleanup(ctx, m1, m2, m3)).To(Succeed())
 	})
-	t.Run("Remediation does not happen if RetryDelay is not yet passed", func(t *testing.T) {
+	t.Run("Remediation does not happen if RetryPeriod is not yet passed", func(t *testing.T) {
 		g := NewWithT(t)
 
 		m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer(), withRemediateForAnnotation(MustMarshalRemediationData(&RemediationData{
@@ -327,7 +329,7 @@ func TestReconcileUnhealthyMachines(t *testing.T) {
 				Version: "v1.19.1",
 				RemediationStrategy: &controlplanev1.RemediationStrategy{
 					MaxRetry:    utilpointer.Int32(3),
-					RetryPeriod: metav1.Duration{Duration: controlplanev1.DefaultMinHealthyPeriod}, // RetryDelaySeconds not yet expired.
+					RetryPeriod: metav1.Duration{Duration: controlplanev1.DefaultMinHealthyPeriod}, // RetryPeriod not yet expired.
 				},
 			},
 		},
@@ -352,7 +354,7 @@ func TestReconcileUnhealthyMachines(t *testing.T) {
 
 		g.Expect(controlPlane.KCP.Annotations).ToNot(HaveKey(controlplanev1.RemediationInProgressAnnotation))
 
-		assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate this machine because the operation already failed in the latest 1h0m0s (RetryDelay)")
+		assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate this machine because the operation already failed in the latest 1h0m0s (RetryPeriod)")
 
 		err = env.Get(ctx, client.ObjectKey{Namespace: m1.Namespace, Name: m1.Name}, m1)
 		g.Expect(err).ToNot(HaveOccurred())
@@ -401,7 +403,7 @@ func TestReconcileUnhealthyMachines(t *testing.T) {
 
 		g.Expect(env.Cleanup(ctx, m)).To(Succeed())
 	})
-	t.Run("Remediation does not happen if there is a deleting machine", func(t *testing.T) {
+	t.Run("Remediation does not happen if there is another machine being deleted (not the one to be remediated)", func(t *testing.T) {
 		g := NewWithT(t)
 
 		m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed())
@@ -1146,7 +1148,7 @@ func TestReconcileUnhealthyMachinesSequences(t *testing.T) {
 
 		err = env.Get(ctx, client.ObjectKey{Namespace: m3.Namespace, Name: m3.Name}, m3)
 		g.Expect(err).ToNot(HaveOccurred())
-		g.Expect(m2.ObjectMeta.DeletionTimestamp.IsZero()).To(BeFalse())
+		g.Expect(m3.ObjectMeta.DeletionTimestamp.IsZero()).To(BeFalse())
 
 		removeFinalizer(g, m3)
 		g.Expect(env.Cleanup(ctx, m3)).To(Succeed())
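
The new assertion added in these tests relies on gomega's HaveKey matcher. A toy sketch of that pattern follows; the literal annotation key below stands in for the controlplanev1.RemediationInProgressAnnotation constant and is an assumption for illustration, not a quote from the codebase.

package example

import (
	"testing"

	. "github.com/onsi/gomega"
)

func TestAnnotationCleared(t *testing.T) {
	g := NewWithT(t)

	annotations := map[string]string{"some-other-annotation": "value"}

	// Passes because the remediation-in-progress key is absent from the map.
	g.Expect(annotations).ToNot(HaveKey("controlplane.cluster.x-k8s.io/remediation-in-progress"))
}
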
7 changes: 3 additions & 4 deletions test/e2e/kcp_remediations.go
@@ -96,7 +96,6 @@ func KCPRemediationSpec(ctx context.Context, inputGetter func() KCPRemediationSp
 		Expect(input.BootstrapClusterProxy).ToNot(BeNil(), "Invalid argument. input.BootstrapClusterProxy can't be nil when calling %s spec", specName)
 		Expect(os.MkdirAll(input.ArtifactFolder, 0750)).To(Succeed(), "Invalid argument. input.ArtifactFolder can't be created for %s spec", specName)
 		Expect(input.E2EConfig.Variables).To(HaveKey(KubernetesVersion))
-		Expect(input.Flavor).ToNot(BeNil(), "Invalid argument. input.Flavor can't be nil when calling %s spec", specName)
 
 		// Setup a Namespace where to host objects for this spec and create a watcher for the namespace events.
 		namespace, cancelWatches = setupSpecNamespace(ctx, specName, input.BootstrapClusterProxy, input.ArtifactFolder)
@@ -117,7 +116,7 @@ func KCPRemediationSpec(ctx context.Context, inputGetter func() KCPRemediationSp
 			Proxy:          input.BootstrapClusterProxy,
 			ArtifactFolder: input.ArtifactFolder,
 			SpecName:       specName,
-			Flavor:         input.Flavor,
+			Flavor:         pointer.StringDeref(input.Flavor, "kcp-remediation"),
 
 			// values to be injected in the template
 
@@ -432,7 +431,7 @@ type createWorkloadClusterAndWaitInput struct {
 	Proxy               framework.ClusterProxy
 	ArtifactFolder      string
 	SpecName            string
-	Flavor              *string
+	Flavor              string
 	Namespace           string
 	AuthenticationToken string
 	ServerAddr          string
@@ -455,7 +454,7 @@ func createWorkloadClusterAndWait(ctx context.Context, input createWorkloadClust
 		KubeconfigPath: input.Proxy.GetKubeconfigPath(),
 
 		// select template
-		Flavor: pointer.StringDeref(input.Flavor, "kcp-remediation"),
+		Flavor: input.Flavor,
 		// define template variables
 		Namespace:   input.Namespace,
 		ClusterName: clusterName,
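
The Flavor change in this file moves defaulting to the spec boundary: the optional *string input is resolved once with pointer.StringDeref (from k8s.io/utils/pointer, which returns the pointed-to value, or the given default when the pointer is nil), so createWorkloadClusterAndWait now receives a plain string that is always set. A small sketch of that pattern, using simplified stand-in types rather than the real e2e framework ones:

package main

import (
	"fmt"

	"k8s.io/utils/pointer"
)

type specInput struct {
	Flavor *string // optional; nil means "use the default flavor"
}

type clusterInput struct {
	Flavor string // always set; defaulting already happened at the boundary
}

func createCluster(in clusterInput) {
	fmt.Println("using flavor:", in.Flavor)
}

func main() {
	// A nil Flavor falls back to "kcp-remediation", mirroring the diff above.
	in := specInput{}
	createCluster(clusterInput{Flavor: pointer.StringDeref(in.Flavor, "kcp-remediation")})
}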
