diff --git a/controlplane/kubeadm/api/v1alpha4/conversion.go b/controlplane/kubeadm/api/v1alpha4/conversion.go index 9da266d9213d..1637b272a82c 100644 --- a/controlplane/kubeadm/api/v1alpha4/conversion.go +++ b/controlplane/kubeadm/api/v1alpha4/conversion.go @@ -180,6 +180,10 @@ func (src *KubeadmControlPlaneTemplate) ConvertTo(dstRaw conversion.Hub) error { dst.Spec.Template.Spec.KubeadmConfigSpec.InitConfiguration.NodeRegistration.ImagePullPolicy = restored.Spec.Template.Spec.KubeadmConfigSpec.InitConfiguration.NodeRegistration.ImagePullPolicy } + if restored.Spec.Template.Spec.RemediationStrategy != nil { + dst.Spec.Template.Spec.RemediationStrategy = restored.Spec.Template.Spec.RemediationStrategy + } + return nil } diff --git a/controlplane/kubeadm/api/v1beta1/kubeadm_control_plane_webhook.go b/controlplane/kubeadm/api/v1beta1/kubeadm_control_plane_webhook.go index dd512fe26967..52a6276ba132 100644 --- a/controlplane/kubeadm/api/v1beta1/kubeadm_control_plane_webhook.go +++ b/controlplane/kubeadm/api/v1beta1/kubeadm_control_plane_webhook.go @@ -171,6 +171,8 @@ func (in *KubeadmControlPlane) ValidateUpdate(old runtime.Object) error { {spec, "machineTemplate", "nodeDeletionTimeout"}, {spec, "replicas"}, {spec, "version"}, + {spec, "remediationStrategy"}, + {spec, "remediationStrategy", "*"}, {spec, "rolloutAfter"}, {spec, "rolloutBefore", "*"}, {spec, "rolloutStrategy", "*"}, diff --git a/controlplane/kubeadm/api/v1beta1/kubeadm_control_plane_webhook_test.go b/controlplane/kubeadm/api/v1beta1/kubeadm_control_plane_webhook_test.go index 3ead11553c28..360e345ec187 100644 --- a/controlplane/kubeadm/api/v1beta1/kubeadm_control_plane_webhook_test.go +++ b/controlplane/kubeadm/api/v1beta1/kubeadm_control_plane_webhook_test.go @@ -407,6 +407,11 @@ func TestKubeadmControlPlaneValidateUpdate(t *testing.T) { validUpdate.Spec.RolloutBefore = &RolloutBefore{ CertificatesExpiryDays: pointer.Int32(14), } + validUpdate.Spec.RemediationStrategy = &RemediationStrategy{ + MaxRetry: pointer.Int32(50), + MinHealthyPeriod: &metav1.Duration{Duration: 10 * time.Hour}, + RetryPeriod: metav1.Duration{Duration: 10 * time.Minute}, + } validUpdate.Spec.KubeadmConfigSpec.Format = bootstrapv1.CloudConfig scaleToZero := before.DeepCopy() diff --git a/controlplane/kubeadm/api/v1beta1/kubeadmcontrolplanetemplate_types.go b/controlplane/kubeadm/api/v1beta1/kubeadmcontrolplanetemplate_types.go index 4817b71b18fb..9fab688e84f7 100644 --- a/controlplane/kubeadm/api/v1beta1/kubeadmcontrolplanetemplate_types.go +++ b/controlplane/kubeadm/api/v1beta1/kubeadmcontrolplanetemplate_types.go @@ -91,6 +91,10 @@ type KubeadmControlPlaneTemplateResourceSpec struct { // +optional // +kubebuilder:default={type: "RollingUpdate", rollingUpdate: {maxSurge: 1}} RolloutStrategy *RolloutStrategy `json:"rolloutStrategy,omitempty"` + + // The RemediationStrategy that controls how control plane machine remediation happens. 
+ // +optional + RemediationStrategy *RemediationStrategy `json:"remediationStrategy,omitempty"` } // KubeadmControlPlaneTemplateMachineTemplate defines the template for Machines diff --git a/controlplane/kubeadm/api/v1beta1/zz_generated.deepcopy.go b/controlplane/kubeadm/api/v1beta1/zz_generated.deepcopy.go index 88d4bca9de22..5d6d56bccd09 100644 --- a/controlplane/kubeadm/api/v1beta1/zz_generated.deepcopy.go +++ b/controlplane/kubeadm/api/v1beta1/zz_generated.deepcopy.go @@ -324,6 +324,11 @@ func (in *KubeadmControlPlaneTemplateResourceSpec) DeepCopyInto(out *KubeadmCont *out = new(RolloutStrategy) (*in).DeepCopyInto(*out) } + if in.RemediationStrategy != nil { + in, out := &in.RemediationStrategy, &out.RemediationStrategy + *out = new(RemediationStrategy) + (*in).DeepCopyInto(*out) + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new KubeadmControlPlaneTemplateResourceSpec. diff --git a/controlplane/kubeadm/config/crd/bases/controlplane.cluster.x-k8s.io_kubeadmcontrolplanetemplates.yaml b/controlplane/kubeadm/config/crd/bases/controlplane.cluster.x-k8s.io_kubeadmcontrolplanetemplates.yaml index a70a55892c40..92fcc11ab3d7 100644 --- a/controlplane/kubeadm/config/crd/bases/controlplane.cluster.x-k8s.io_kubeadmcontrolplanetemplates.yaml +++ b/controlplane/kubeadm/config/crd/bases/controlplane.cluster.x-k8s.io_kubeadmcontrolplanetemplates.yaml @@ -2382,6 +2382,55 @@ spec: time limitations. type: string type: object + remediationStrategy: + description: The RemediationStrategy that controls how control + plane machine remediation happens. + properties: + maxRetry: + description: "MaxRetry is the Max number of retries while + attempting to remediate an unhealthy machine. A retry + happens when a machine that was created as a replacement + for an unhealthy machine also fails. For example, given + a control plane with three machines M1, M2, M3: \n M1 + become unhealthy; remediation happens, and M1-1 is created + as a replacement. If M1-1 (replacement of M1) has problems + while bootstrapping it will become unhealthy, and then + be remediated; such operation is considered a retry, + remediation-retry #1. If M1-2 (replacement of M1-2) + becomes unhealthy, remediation-retry #2 will happen, + etc. \n A retry could happen only after RetryPeriod + from the previous retry. If a machine is marked as unhealthy + after MinHealthyPeriod from the previous remediation + expired, this is not considered a retry anymore because + the new issue is assumed unrelated from the previous + one. \n If not set, the remedation will be retried infinitely." + format: int32 + type: integer + minHealthyPeriod: + description: "MinHealthyPeriod defines the duration after + which KCP will consider any failure to a machine unrelated + from the previous one. In this case the remediation + is not considered a retry anymore, and thus the retry + counter restarts from 0. For example, assuming MinHealthyPeriod + is set to 1h (default) \n M1 become unhealthy; remediation + happens, and M1-1 is created as a replacement. If M1-1 + (replacement of M1) has problems within the 1hr after + the creation, also this machine will be remediated and + this operation is considered a retry - a problem related + to the original issue happened to M1 -. \n If instead + the problem on M1-1 is happening after MinHealthyPeriod + expired, e.g. four days after m1-1 has been created + as a remediation of M1, the problem on M1-1 is considered + unrelated to the original issue happened to M1. 
\n If + not set, this value is defaulted to 1h." + type: string + retryPeriod: + description: "RetryPeriod is the duration that KCP should + wait before remediating a machine being created as a + replacement for an unhealthy machine (a retry). \n If + not set, a retry will happen immediately." + type: string + type: object rolloutAfter: description: RolloutAfter is a field to indicate a rollout should be performed after the specified time even if no diff --git a/controlplane/kubeadm/internal/controllers/helpers.go b/controlplane/kubeadm/internal/controllers/helpers.go index f6ef0dc13b48..a94370537cd2 100644 --- a/controlplane/kubeadm/internal/controllers/helpers.go +++ b/controlplane/kubeadm/internal/controllers/helpers.go @@ -341,7 +341,7 @@ func (r *KubeadmControlPlaneReconciler) generateMachine(ctx context.Context, kcp } // Remove the annotation tracking that a remediation is in progress (the remediation completed when - // the replacement machine have been created above). + // the replacement machine has been created above). delete(kcp.Annotations, controlplanev1.RemediationInProgressAnnotation) return nil diff --git a/controlplane/kubeadm/internal/controllers/remediation.go b/controlplane/kubeadm/internal/controllers/remediation.go index c0a6b6f9479d..72a0eb8236aa 100644 --- a/controlplane/kubeadm/internal/controllers/remediation.go +++ b/controlplane/kubeadm/internal/controllers/remediation.go @@ -73,11 +73,6 @@ func (r *KubeadmControlPlaneReconciler) reconcileUnhealthyMachines(ctx context.C return ctrl.Result{}, kerrors.NewAggregate(errList) } - // Returns if another remediation is in progress but the new machine is not yet created. - if _, ok := controlPlane.KCP.Annotations[controlplanev1.RemediationInProgressAnnotation]; ok { - return ctrl.Result{}, nil - } - // Gets all machines that have `MachineHealthCheckSucceeded=False` (indicating a problem was detected on the machine) // and `MachineOwnerRemediated` present, indicating that this controller is responsible for performing remediation. unhealthyMachines := controlPlane.UnhealthyMachines() @@ -99,6 +94,16 @@ func (r *KubeadmControlPlaneReconciler) reconcileUnhealthyMachines(ctx context.C return ctrl.Result{}, nil } + log = log.WithValues("Machine", klog.KObj(machineToBeRemediated), "initialized", controlPlane.KCP.Status.Initialized) + + // Returns if another remediation is in progress but the new Machine is not yet created. + // Note: This condition is checked after we check for unhealthy Machines and if machineToBeRemediated + // is being deleted to avoid unnecessary logs if no further remediation should be done. + if _, ok := controlPlane.KCP.Annotations[controlplanev1.RemediationInProgressAnnotation]; ok { + log.Info("Another remediation is already in progress. Skipping remediation.") + return ctrl.Result{}, nil + } + patchHelper, err := patch.NewHelper(machineToBeRemediated, r.Client) if err != nil { return ctrl.Result{}, err @@ -118,7 +123,6 @@ func (r *KubeadmControlPlaneReconciler) reconcileUnhealthyMachines(ctx context.C // Before starting remediation, run preflight checks in order to verify it is safe to remediate. // If any of the following checks fails, we'll surface the reason in the MachineOwnerRemediated condition. - log = log.WithValues("Machine", klog.KObj(machineToBeRemediated), "initialized", controlPlane.KCP.Status.Initialized) // Check if KCP is allowed to remediate considering retry limits: // - Remediation cannot happen because retryPeriod is not yet expired. 
@@ -132,9 +136,11 @@ func (r *KubeadmControlPlaneReconciler) reconcileUnhealthyMachines(ctx context.C return ctrl.Result{}, nil } - // Executes checks that applies only if the control plane is already initialized; in this case KCP can - // remediate only if it can safely assume that the operation preserves the operation state of the existing cluster (or at least it doesn't make it worst). if controlPlane.KCP.Status.Initialized { + // Executes checks that apply only if the control plane is already initialized; in this case KCP can + // remediate only if it can safely assume that the operation preserves the operation state of the + // existing cluster (or at least it doesn't make it worse). + // The cluster MUST have more than one replica, because this is the smallest cluster size that allows any etcd failure tolerance. if controlPlane.Machines.Len() <= 1 { log.Info("A control plane machine needs remediation, but the number of current replicas is less or equal to 1. Skipping remediation", "Replicas", controlPlane.Machines.Len()) @@ -163,22 +169,14 @@ func (r *KubeadmControlPlaneReconciler) reconcileUnhealthyMachines(ctx context.C return ctrl.Result{}, nil } } - } - - // Prepare the info for tracking the remediation progress into the RemediationInProgressAnnotation. - remediationInProgressValue, err := remediationInProgressData.Marshal() - if err != nil { - return ctrl.Result{}, err - } - // Start remediating the unhealthy control plane machine by deleting it. - // A new machine will come up completing the operation as part of the regular + // Start remediating the unhealthy control plane machine by deleting it. + // A new machine will come up completing the operation as part of the regular reconcile. - // If the control plane is initialized, before deleting the machine: - // - if the machine hosts the etcd leader, forward etcd leadership to another machine. - // - delete the etcd member hosted on the machine being deleted. - // - remove the etcd member from the kubeadm config map (only for kubernetes version older than v1.22.0) - if controlPlane.KCP.Status.Initialized { + // If the control plane is initialized, before deleting the machine: + // - if the machine hosts the etcd leader, forward etcd leadership to another machine. + // - delete the etcd member hosted on the machine being deleted. + // - remove the etcd member from the kubeadm config map (only for kubernetes version older than v1.22.0) workloadCluster, err := r.managementCluster.GetWorkloadCluster(ctx, util.ObjectKey(controlPlane.Cluster)) if err != nil { log.Error(err, "Failed to create client to workload cluster") @@ -227,6 +225,12 @@ func (r *KubeadmControlPlaneReconciler) reconcileUnhealthyMachines(ctx context.C log.Info("Remediating unhealthy machine") conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "") + // Prepare the info for tracking the remediation progress into the RemediationInProgressAnnotation. + remediationInProgressValue, err := remediationInProgressData.Marshal() + if err != nil { + return ctrl.Result{}, err + } + // Set annotations tracking remediation details so they can be picked up by the machine // that will be created as part of the scale up action that completes the remediation. 
annotations.AddAnnotations(controlPlane.KCP, map[string]string{ @@ -263,49 +267,39 @@ func (r *KubeadmControlPlaneReconciler) checkRetryLimits(log logr.Logger, machin return remediationInProgressData, true, nil } - // Gets MinHealthySeconds and RetryDelaySeconds from the remediation strategy, or use defaults. + // Gets MinHealthyPeriod and RetryPeriod from the remediation strategy, or use defaults. minHealthyPeriod := controlplanev1.DefaultMinHealthyPeriod if controlPlane.KCP.Spec.RemediationStrategy != nil && controlPlane.KCP.Spec.RemediationStrategy.MinHealthyPeriod != nil { minHealthyPeriod = controlPlane.KCP.Spec.RemediationStrategy.MinHealthyPeriod.Duration } - retryPeriod := time.Duration(0) if controlPlane.KCP.Spec.RemediationStrategy != nil { retryPeriod = controlPlane.KCP.Spec.RemediationStrategy.RetryPeriod.Duration } // Gets the timestamp of the last remediation; if missing, default to a value - // that ensures both MinHealthySeconds and RetryDelaySeconds are expired. + // that ensures both MinHealthyPeriod and RetryPeriod are expired. // NOTE: this could potentially lead to executing more retries than expected or to executing retries before than // expected, but this is considered acceptable when the system recovers from someone/something changes or deletes // the RemediationForAnnotation on Machines. - max := func(x, y time.Duration) time.Duration { - if x < y { - return y - } - return x - } - lastRemediationTime := reconciliationTime.Add(-2 * max(minHealthyPeriod, retryPeriod)) if !lastRemediationData.Timestamp.IsZero() { lastRemediationTime = lastRemediationData.Timestamp.Time } - // Check if the machine being remediated has been created as a remediation for a previous unhealthy machine. - // NOTE: if someone/something changes or deletes the RemediationForAnnotation on Machines, this could potentially - // lead to executing more retries than expected, but this is considered acceptable in such a case. - machineRemediationFor := remediationInProgressData.Machine - if lastRemediationData.Machine != "" { - // If the remediation is happening before minHealthyPeriod is expired, then KCP considers this - // as a remediation for the same previously unhealthy machine. - if lastRemediationTime.Add(minHealthyPeriod).After(reconciliationTime) { - machineRemediationFor = lastRemediationData.Machine - log = log.WithValues("RemediationRetryFor", klog.KRef(machineToBeRemediated.Namespace, machineRemediationFor)) - } + // Once we get here we already know that there was a last remediation for the Machine. + // If the current remediation is happening before minHealthyPeriod is expired, then KCP considers this + // as a remediation for the same previously unhealthy machine. + // NOTE: If someone/something changes the RemediationForAnnotation on Machines (e.g. changes the Timestamp), + // this could potentially lead to executing more retries than expected, but this is considered acceptable in such a case. + var retryForSameMachineInProgress bool + if lastRemediationTime.Add(minHealthyPeriod).After(reconciliationTime) { + retryForSameMachineInProgress = true + log = log.WithValues("RemediationRetryFor", klog.KRef(machineToBeRemediated.Namespace, lastRemediationData.Machine)) } - // If remediation is happening for a different machine, this is the first try of a new retry sequence. - if lastRemediationData.Machine != machineRemediationFor { + // If the retry for the same machine is not in progress, this is the first try of a new retry sequence. 
+ if !retryForSameMachineInProgress { return remediationInProgressData, true, nil } @@ -315,7 +309,7 @@ func (r *KubeadmControlPlaneReconciler) checkRetryLimits(log logr.Logger, machin // Check if remediation can happen because retryPeriod is passed. if lastRemediationTime.Add(retryPeriod).After(reconciliationTime) { log.Info(fmt.Sprintf("A control plane machine needs remediation, but the operation already failed in the latest %s. Skipping remediation", retryPeriod)) - conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate this machine because the operation already failed in the latest %s (RetryDelay)", retryPeriod) + conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate this machine because the operation already failed in the latest %s (RetryPeriod)", retryPeriod) return remediationInProgressData, false, nil } @@ -329,12 +323,20 @@ func (r *KubeadmControlPlaneReconciler) checkRetryLimits(log logr.Logger, machin } } - // All the check passed, increase the remediation number. + // All the check passed, increase the remediation retry count. remediationInProgressData.RetryCount++ return remediationInProgressData, true, nil } +// max calculates the maximum duration. +func max(x, y time.Duration) time.Duration { + if x < y { + return y + } + return x +} + // canSafelyRemoveEtcdMember assess if it is possible to remove the member hosted on the machine to be remediated // without loosing etcd quorum. // diff --git a/controlplane/kubeadm/internal/controllers/remediation_test.go b/controlplane/kubeadm/internal/controllers/remediation_test.go index 3766cbacbf56..26183baa1cc7 100644 --- a/controlplane/kubeadm/internal/controllers/remediation_test.go +++ b/controlplane/kubeadm/internal/controllers/remediation_test.go @@ -106,7 +106,7 @@ func TestReconcileUnhealthyMachines(t *testing.T) { t.Run("reconcileUnhealthyMachines return early if another remediation is in progress", func(t *testing.T) { g := NewWithT(t) - m := getDeletingMachine(ns.Name, "m1-unhealthy-deleting-", withMachineHealthCheckFailed()) + m := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withStuckRemediation()) conditions.MarkFalse(m, clusterv1.MachineHealthCheckSucceededCondition, clusterv1.MachineHasFailureReason, clusterv1.ConditionSeverityWarning, "") conditions.MarkFalse(m, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "") controlPlane := &internal.ControlPlane{ @@ -142,6 +142,8 @@ func TestReconcileUnhealthyMachines(t *testing.T) { } ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane) + g.Expect(controlPlane.KCP.Annotations).ToNot(HaveKey(controlplanev1.RemediationInProgressAnnotation)) + g.Expect(ret.IsZero()).To(BeTrue()) // Remediation skipped g.Expect(err).ToNot(HaveOccurred()) }) @@ -309,7 +311,7 @@ func TestReconcileUnhealthyMachines(t *testing.T) { removeFinalizer(g, m1) g.Expect(env.Cleanup(ctx, m1, m2, m3)).To(Succeed()) }) - t.Run("Remediation does not happen if RetryDelay is not yet passed", func(t *testing.T) { + t.Run("Remediation does not happen if RetryPeriod is not yet passed", func(t *testing.T) { g := NewWithT(t) m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer(), 
withRemediateForAnnotation(MustMarshalRemediationData(&RemediationData{ @@ -327,7 +329,7 @@ func TestReconcileUnhealthyMachines(t *testing.T) { Version: "v1.19.1", RemediationStrategy: &controlplanev1.RemediationStrategy{ MaxRetry: utilpointer.Int32(3), - RetryPeriod: metav1.Duration{Duration: controlplanev1.DefaultMinHealthyPeriod}, // RetryDelaySeconds not yet expired. + RetryPeriod: metav1.Duration{Duration: controlplanev1.DefaultMinHealthyPeriod}, // RetryPeriod not yet expired. }, }, }, @@ -352,7 +354,7 @@ func TestReconcileUnhealthyMachines(t *testing.T) { g.Expect(controlPlane.KCP.Annotations).ToNot(HaveKey(controlplanev1.RemediationInProgressAnnotation)) - assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate this machine because the operation already failed in the latest 1h0m0s (RetryDelay)") + assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate this machine because the operation already failed in the latest 1h0m0s (RetryPeriod)") err = env.Get(ctx, client.ObjectKey{Namespace: m1.Namespace, Name: m1.Name}, m1) g.Expect(err).ToNot(HaveOccurred()) @@ -401,7 +403,7 @@ func TestReconcileUnhealthyMachines(t *testing.T) { g.Expect(env.Cleanup(ctx, m)).To(Succeed()) }) - t.Run("Remediation does not happen if there is a deleting machine", func(t *testing.T) { + t.Run("Remediation does not happen if there is another machine being deleted (not the one to be remediated)", func(t *testing.T) { g := NewWithT(t) m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed()) @@ -1146,7 +1148,7 @@ func TestReconcileUnhealthyMachinesSequences(t *testing.T) { err = env.Get(ctx, client.ObjectKey{Namespace: m3.Namespace, Name: m3.Name}, m3) g.Expect(err).ToNot(HaveOccurred()) - g.Expect(m2.ObjectMeta.DeletionTimestamp.IsZero()).To(BeFalse()) + g.Expect(m3.ObjectMeta.DeletionTimestamp.IsZero()).To(BeFalse()) removeFinalizer(g, m3) g.Expect(env.Cleanup(ctx, m3)).To(Succeed()) diff --git a/docs/book/src/reference/labels_and_annotations.md b/docs/book/src/reference/labels_and_annotations.md index be03817863d3..5a6e5089bf24 100644 --- a/docs/book/src/reference/labels_and_annotations.md +++ b/docs/book/src/reference/labels_and_annotations.md @@ -49,5 +49,5 @@ | controlplane.cluster.x-k8s.io/skip-coredns | It explicitly skips reconciling CoreDNS if set. | | controlplane.cluster.x-k8s.io/skip-kube-proxy | It explicitly skips reconciling kube-proxy if set. | | controlplane.cluster.x-k8s.io/kubeadm-cluster-configuration | It is a machine annotation that stores the json-marshalled string of KCP ClusterConfiguration. This annotation is used to detect any changes in ClusterConfiguration and trigger machine rollout in KCP. | -| controlplane.cluster.x-k8s.io/remediation-in-progress | It is a KCP remediation is that tracks that the system is in between having deleted an unhealthy machine and recreating its replacement. | +| controlplane.cluster.x-k8s.io/remediation-in-progress | It is a KCP annotation that tracks that the system is in between having deleted an unhealthy machine and recreating its replacement. | | controlplane.cluster.x-k8s.io/remediation-for | It is a machine annotation that links a new machine to the unhealthy machine it is replacing. 
| diff --git a/test/e2e/kcp_remediations.go b/test/e2e/kcp_remediations.go index c0e69cd0a0bc..64fc229b0b59 100644 --- a/test/e2e/kcp_remediations.go +++ b/test/e2e/kcp_remediations.go @@ -19,16 +19,13 @@ package e2e import ( "context" "fmt" - "io" "os" - "os/exec" "path/filepath" - "runtime" - "strings" "time" . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" + authenticationv1 "k8s.io/api/authentication/v1" corev1 "k8s.io/api/core/v1" rbacv1 "k8s.io/api/rbac/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -117,7 +114,7 @@ func KCPRemediationSpec(ctx context.Context, inputGetter func() KCPRemediationSp Proxy: input.BootstrapClusterProxy, ArtifactFolder: input.ArtifactFolder, SpecName: specName, - Flavor: input.Flavor, + Flavor: pointer.StringDeref(input.Flavor, "kcp-remediation"), // values to be injected in the template @@ -126,7 +123,7 @@ func KCPRemediationSpec(ctx context.Context, inputGetter func() KCPRemediationSp // NOTE: this func also setups credentials/RBAC rules and everything necessary to get the authenticationToken. AuthenticationToken: getAuthenticationToken(ctx, input.BootstrapClusterProxy, namespace.Name), // Address to be used for accessing the management cluster from a workload cluster. - ServerAddr: getServerAddr(input.BootstrapClusterProxy.GetKubeconfigPath()), + ServerAddr: getServerAddr(ctx, input.BootstrapClusterProxy), }) // The first CP machine comes up but it does not complete bootstrap @@ -235,7 +232,7 @@ func KCPRemediationSpec(ctx context.Context, inputGetter func() KCPRemediationSp Expect(secondMachine.Status.NodeRef).To(BeNil()) log.Logf("Machine %s is up but still bootstrapping", secondMachineName) - // Intentionally trigger remediation on the second CP, and validate and validate also this one is deleted and a replacement should come up. + // Intentionally trigger remediation on the second CP and validate that also this one is deleted and a replacement should come up. By("REMEDIATING SECOND CONTROL PLANE MACHINE") @@ -432,17 +429,15 @@ type createWorkloadClusterAndWaitInput struct { Proxy framework.ClusterProxy ArtifactFolder string SpecName string - Flavor *string + Flavor string Namespace string - AuthenticationToken []byte + AuthenticationToken string ServerAddr string } -// createWorkloadClusterAndWait creates a workload cluster ard return as soon as the cluster infrastructure is ready. +// createWorkloadClusterAndWait creates a workload cluster and return as soon as the cluster infrastructure is ready. // NOTE: we are not using the same func used by other tests because it would fail if the control plane doesn't come up, -// -// which instead is expected in this case. -// +// which instead is expected in this case. // NOTE: clusterResources is filled only partially. 
func createWorkloadClusterAndWait(ctx context.Context, input createWorkloadClusterAndWaitInput) (clusterResources *clusterctl.ApplyClusterTemplateAndWaitResult) { clusterResources = new(clusterctl.ApplyClusterTemplateAndWaitResult) @@ -457,7 +452,7 @@ func createWorkloadClusterAndWait(ctx context.Context, input createWorkloadClust KubeconfigPath: input.Proxy.GetKubeconfigPath(), // select template - Flavor: pointer.StringDeref(input.Flavor, "kcp-remediation"), + Flavor: input.Flavor, // define template variables Namespace: input.Namespace, ClusterName: clusterName, @@ -469,7 +464,7 @@ func createWorkloadClusterAndWait(ctx context.Context, input createWorkloadClust LogFolder: filepath.Join(input.ArtifactFolder, "clusters", input.Proxy.GetName()), // Adds authenticationToken, server address and namespace variables to be injected in the cluster template. ClusterctlVariables: map[string]string{ - "TOKEN": string(input.AuthenticationToken), + "TOKEN": input.AuthenticationToken, "SERVER": input.ServerAddr, "NAMESPACE": input.Namespace, }, @@ -560,11 +555,10 @@ func waitForMachines(ctx context.Context, input waitForMachinesInput) (allMachin // Waits for the desired set of machines to exist. log.Logf("Waiting for %d machines, must have %s, must not have %s", input.ExpectedReplicas, expectedOldMachines.UnsortedList(), expectedDeletedMachines.UnsortedList()) - Eventually(func() bool { + Eventually(func(g Gomega) { // Gets the list of machines - if err := input.Lister.List(ctx, machineList, inClustersNamespaceListOption, matchClusterListOption); err != nil { - return false - } + g.Expect(input.Lister.List(ctx, machineList, inClustersNamespaceListOption, matchClusterListOption)).To(Succeed()) + allMachines = sets.Set[string]{} for i := range machineList.Items { allMachines.Insert(machineList.Items[i].Name) @@ -578,17 +572,17 @@ func waitForMachines(ctx context.Context, input waitForMachinesInput) (allMachin log.Logf(" - expected %d, got %d: %s, of which new %s, must have check: %t, must not have check: %t", input.ExpectedReplicas, allMachines.Len(), allMachines.UnsortedList(), newMachines.UnsortedList(), allMachines.HasAll(expectedOldMachines.UnsortedList()...), !allMachines.HasAny(expectedDeletedMachines.UnsortedList()...)) // Ensures all the expected old machines are still there. - if !allMachines.HasAll(expectedOldMachines.UnsortedList()...) { - return false - } + g.Expect(allMachines.HasAll(expectedOldMachines.UnsortedList()...)).To(BeTrue(), + "Got machines: %s, must contain all of: %s", allMachines.UnsortedList(), expectedOldMachines.UnsortedList()) // Ensures none of the machines to be deleted is still there. - if allMachines.HasAny(expectedDeletedMachines.UnsortedList()...) 
{ - return false - } + g.Expect(!allMachines.HasAny(expectedDeletedMachines.UnsortedList()...)).To(BeTrue(), + "Got machines: %s, must not contain any of: %s", allMachines.UnsortedList(), expectedDeletedMachines.UnsortedList()) - return allMachines.Len() == input.ExpectedReplicas - }, input.WaitForMachinesIntervals...).Should(BeTrue(), "Failed to get the expected list of machines: got %s (expected %d machines, must have %s, must not have %s)", allMachines.UnsortedList(), input.ExpectedReplicas, expectedOldMachines.UnsortedList(), expectedDeletedMachines.UnsortedList()) + g.Expect(allMachines).To(HaveLen(input.ExpectedReplicas), "Got %d machines, must be %d", len(allMachines), input.ExpectedReplicas) + }, input.WaitForMachinesIntervals...).Should(Succeed(), + "Failed to get the expected list of machines: got %s (expected %d machines, must have %s, must not have %s)", + allMachines.UnsortedList(), input.ExpectedReplicas, expectedOldMachines.UnsortedList(), expectedDeletedMachines.UnsortedList()) log.Logf("Got %d machines: %s", allMachines.Len(), allMachines.UnsortedList()) // Ensures the desired set of machines is stable (no further machines are created or deleted). @@ -611,27 +605,31 @@ func waitForMachines(ctx context.Context, input waitForMachinesInput) (allMachin } // getServerAddr returns the address to be used for accessing the management cluster from a workload cluster. -func getServerAddr(kubeconfigPath string) string { - kubeConfig, err := clientcmd.LoadFromFile(kubeconfigPath) - Expect(err).ToNot(HaveOccurred(), "failed to load management cluster's kubeconfig file") +func getServerAddr(ctx context.Context, clusterProxy framework.ClusterProxy) string { + // With CAPD, we can't just access the bootstrap cluster via 127.0.0.1: from the + // workload cluster. Instead we retrieve the server name from the cluster-info ConfigMap in the bootstrap + // cluster (e.g. "https://test-z45p9k-control-plane:6443") + // Note: This has been tested with MacOS,Linux and Prow. + clusterInfoCM := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: "cluster-info", + Namespace: metav1.NamespacePublic, + }, + } + Expect(clusterProxy.GetClient().Get(ctx, client.ObjectKeyFromObject(clusterInfoCM), clusterInfoCM)).To(Succeed()) + Expect(clusterInfoCM.Data).To(HaveKey("kubeconfig")) - clusterName := kubeConfig.Contexts[kubeConfig.CurrentContext].Cluster - Expect(clusterName).ToNot(BeEmpty(), "failed to identify current cluster name in management cluster's kubeconfig file") + kubeConfigString := clusterInfoCM.Data["kubeconfig"] - serverAddr := kubeConfig.Clusters[clusterName].Server - Expect(serverAddr).ToNot(BeEmpty(), "failed to identify current server address in management cluster's kubeconfig file") + kubeConfig, err := clientcmd.Load([]byte(kubeConfigString)) + Expect(err).ToNot(HaveOccurred()) - // On CAPD, if not running on Linux, we need to use Docker's proxy to connect back to the host - // to the CAPD cluster. Moby on Linux doesn't use the host.docker.internal DNS name. - if runtime.GOOS != "linux" { - serverAddr = strings.ReplaceAll(serverAddr, "127.0.0.1", "host.docker.internal") - } - return serverAddr + return kubeConfig.Clusters[""].Server } // getAuthenticationToken returns a bearer authenticationToken with minimal RBAC permissions to access the mhc-test ConfigMap that will be used // to control machines bootstrap during the remediation tests. 
-func getAuthenticationToken(ctx context.Context, managementClusterProxy framework.ClusterProxy, namespace string) []byte { +func getAuthenticationToken(ctx context.Context, managementClusterProxy framework.ClusterProxy, namespace string) string { sa := &corev1.ServiceAccount{ ObjectMeta: metav1.ObjectMeta{ Name: "mhc-test", @@ -677,21 +675,13 @@ func getAuthenticationToken(ctx context.Context, managementClusterProxy framewor } Expect(managementClusterProxy.GetClient().Create(ctx, roleBinding)).To(Succeed(), "failed to create mhc-test role binding") - cmd := exec.CommandContext(ctx, "kubectl", fmt.Sprintf("--kubeconfig=%s", managementClusterProxy.GetKubeconfigPath()), fmt.Sprintf("--namespace=%s", namespace), "create", "token", "mhc-test") //nolint:gosec - stdout, err := cmd.StdoutPipe() - Expect(err).ToNot(HaveOccurred(), "failed to get stdout for kubectl create authenticationToken") - stderr, err := cmd.StderrPipe() - Expect(err).ToNot(HaveOccurred(), "failed to get stderr for kubectl create authenticationToken") - - Expect(cmd.Start()).To(Succeed(), "failed to run kubectl create authenticationToken") - - output, err := io.ReadAll(stdout) - Expect(err).ToNot(HaveOccurred(), "failed to read stdout from kubectl create authenticationToken") - errout, err := io.ReadAll(stderr) - Expect(err).ToNot(HaveOccurred(), "failed to read stderr from kubectl create authenticationToken") - - Expect(cmd.Wait()).To(Succeed(), "failed to wait kubectl create authenticationToken") - Expect(errout).To(BeEmpty()) + tokenRequest := &authenticationv1.TokenRequest{ + Spec: authenticationv1.TokenRequestSpec{ + ExpirationSeconds: pointer.Int64(2 * 60 * 60), // 2 hours. + }, + } + tokenRequest, err := managementClusterProxy.GetClientSet().CoreV1().ServiceAccounts(namespace).CreateToken(ctx, "mhc-test", tokenRequest, metav1.CreateOptions{}) + Expect(err).ToNot(HaveOccurred()) - return output + return tokenRequest.Status.Token } diff --git a/test/infrastructure/docker/internal/controllers/dockermachine_controller.go b/test/infrastructure/docker/internal/controllers/dockermachine_controller.go index 7dd5737d1197..7920065ae40f 100644 --- a/test/infrastructure/docker/internal/controllers/dockermachine_controller.go +++ b/test/infrastructure/docker/internal/controllers/dockermachine_controller.go @@ -316,7 +316,6 @@ func (r *DockerMachineReconciler) reconcileNormal(ctx context.Context, cluster * for { select { case <-timeoutCtx.Done(): - log.Info("Cancelling Bootstrap due to timeout") return default: updatedDockerMachine := &infrav1.DockerMachine{} @@ -353,13 +352,6 @@ func (r *DockerMachineReconciler) reconcileNormal(ctx context.Context, cluster * return ctrl.Result{RequeueAfter: 5 * time.Second}, nil } - // If the control plane is not yet initialized, there is no API server to contact to get the ProviderID for the Node - // hosted on this machine, so return early. - // NOTE: we are using RequeueAfter with a short interval in order to make test execution time more stable. - if !conditions.IsTrue(cluster, clusterv1.ControlPlaneInitializedCondition) { - return ctrl.Result{RequeueAfter: 15 * time.Second}, nil - } - // Usually a cloud provider will do this, but there is no docker-cloud provider. // Requeue if there is an error, as this is likely momentary load balancer // state changes during control plane provisioning.
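The last two e2e changes replace shelling out to `kubectl create token` with an in-process TokenRequest call, and derive the management-cluster address from the `cluster-info` ConfigMap instead of rewriting the kubeconfig server address for non-Linux hosts. Below is a self-contained sketch of the TokenRequest pattern now used by `getAuthenticationToken`, assuming an already-constructed `*kubernetes.Clientset`; the function name, ServiceAccount name, and error wrapping are illustrative, while the API call and the 2-hour expiry mirror the diff.

```go
// Sketch only: requests a short-lived ServiceAccount token via the TokenRequest API,
// the same client-go call the e2e helper now uses instead of exec'ing kubectl.
package example

import (
	"context"
	"fmt"

	authenticationv1 "k8s.io/api/authentication/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
	"k8s.io/utils/pointer"
)

func serviceAccountToken(ctx context.Context, cs *kubernetes.Clientset, namespace, name string) (string, error) {
	req := &authenticationv1.TokenRequest{
		Spec: authenticationv1.TokenRequestSpec{
			// Keep the token short-lived; the e2e test uses 2 hours.
			ExpirationSeconds: pointer.Int64(2 * 60 * 60),
		},
	}
	resp, err := cs.CoreV1().ServiceAccounts(namespace).CreateToken(ctx, name, req, metav1.CreateOptions{})
	if err != nil {
		return "", fmt.Errorf("failed to create token for ServiceAccount %s/%s: %w", namespace, name, err)
	}
	return resp.Status.Token, nil
}
```

On the server-address side, reading the kubeconfig embedded in the `cluster-info` ConfigMap returns an address (e.g. "https://test-z45p9k-control-plane:6443") that is already reachable from inside the workload cluster, which is why the previous 127.0.0.1 to host.docker.internal rewrite for non-Linux hosts is no longer needed.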