diff --git a/internal/controllers/topology/cluster/structuredmerge/dryrun.go b/internal/controllers/topology/cluster/structuredmerge/dryrun.go index 424b4da4632c..08a3f6bbb34c 100644 --- a/internal/controllers/topology/cluster/structuredmerge/dryrun.go +++ b/internal/controllers/topology/cluster/structuredmerge/dryrun.go @@ -33,11 +33,12 @@ import ( type dryRunSSAPatchInput struct { client client.Client - // originalUnstructured contains the current state of the object + // originalUnstructured contains the current state of the object. + // Note: We will run SSA dry-run on originalUnstructured and modifiedUnstructured and then compare them. originalUnstructured *unstructured.Unstructured - // dryRunUnstructured contains the intended changes to the object and will be used to - // compare to the originalUnstructured object afterwards. - dryRunUnstructured *unstructured.Unstructured + // modifiedUnstructured contains the intended changes to the object. + // Note: We will run SSA dry-run on originalUnstructured and modifiedUnstructured and then compare them. + modifiedUnstructured *unstructured.Unstructured // helperOptions contains the helper options for filtering the intent. helperOptions *HelperOptions } @@ -52,16 +53,46 @@ func dryRunSSAPatch(ctx context.Context, dryRunCtx *dryRunSSAPatchInput) (bool, } // Add TopologyDryRunAnnotation to notify validation webhooks to skip immutability checks. - if err := unstructured.SetNestedField(dryRunCtx.dryRunUnstructured.Object, "", "metadata", "annotations", clusterv1.TopologyDryRunAnnotation); err != nil { - return false, false, errors.Wrap(err, "failed to add topology dry-run annotation") + if err := unstructured.SetNestedField(dryRunCtx.originalUnstructured.Object, "", "metadata", "annotations", clusterv1.TopologyDryRunAnnotation); err != nil { + return false, false, errors.Wrap(err, "failed to add topology dry-run annotation to original object") + } + if err := unstructured.SetNestedField(dryRunCtx.modifiedUnstructured.Object, "", "metadata", "annotations", clusterv1.TopologyDryRunAnnotation); err != nil { + return false, false, errors.Wrap(err, "failed to add topology dry-run annotation to modified object") } - // Do a server-side apply dry-run request to get the updated object. - err := dryRunCtx.client.Patch(ctx, dryRunCtx.dryRunUnstructured, client.Apply, client.DryRunAll, client.FieldOwner(TopologyManagerName), client.ForceOwnership) + // Do a server-side apply dry-run with modifiedUnstructured to get the updated object. + err := dryRunCtx.client.Patch(ctx, dryRunCtx.modifiedUnstructured, client.Apply, client.DryRunAll, client.FieldOwner(TopologyManagerName), client.ForceOwnership) if err != nil { // This catches errors like metadata.uid changes. - return false, false, errors.Wrap(err, "failed to request dry-run server side apply") + return false, false, errors.Wrap(err, "server side apply dry-run failed for modified object") + } + + // Do a server-side apply dry-run with originalUnstructured to ensure the latest defaulting is applied. + // Note: After a Cluster API upgrade there is no guarantee that defaulting has been run on existing objects. + // We have to ensure defaulting has been applied before comparing original with modified, because otherwise + // differences in defaulting would trigger rollouts. + // Note: We cannot use the managed fields of originalUnstructured after SSA dryrun, because applying + // the whole originalUnstructured will give capi-topology ownership of all fields. Thus, we back up the + // managed fields and restore them after the dry run. + // It's fine to compare the managed fields of modifiedUnstructured after dry-run with originalUnstructured + // before dry-run as we want to know if applying modifiedUnstructured would change managed fields on original. + + // Filter object to drop fields which are not part of our intent. + // Note: It's especially important to also drop metadata.resourceVersion, otherwise we could get the following + // error: "the object has been modified; please apply your changes to the latest version and try again" + filterObject(dryRunCtx.originalUnstructured, dryRunHelperOptions) + // Backup managed fields. + originalUnstructuredManagedFieldsBeforeSSA := dryRunCtx.originalUnstructured.GetManagedFields() + // Set managed fields to nil. + // Note: Otherwise we would get the following error: + // "failed to request dry-run server side apply: metadata.managedFields must be nil" + dryRunCtx.originalUnstructured.SetManagedFields(nil) + err = dryRunCtx.client.Patch(ctx, dryRunCtx.originalUnstructured, client.Apply, client.DryRunAll, client.FieldOwner(TopologyManagerName), client.ForceOwnership) + if err != nil { + return false, false, errors.Wrap(err, "server side apply dry-run failed for original object") } + // Restore managed fields. + dryRunCtx.originalUnstructured.SetManagedFields(originalUnstructuredManagedFieldsBeforeSSA) // Cleanup the dryRunUnstructured object to remove the added TopologyDryRunAnnotation // and remove the affected managedFields for `manager=capi-topology` which would @@ -72,8 +103,8 @@ func dryRunSSAPatch(ctx context.Context, dryRunCtx *dryRunSSAPatchInput) (bool, // changes to the object. // Please note that if other managers made changes to fields that we care about and thus ownership changed, // this would affect our managed fields as well and we would still detect it by diffing our managed fields. - if err := cleanupManagedFieldsAndAnnotation(dryRunCtx.dryRunUnstructured); err != nil { - return false, false, errors.Wrap(err, "failed to filter topology dry-run annotation on dryRunUnstructured") + if err := cleanupManagedFieldsAndAnnotation(dryRunCtx.modifiedUnstructured); err != nil { + return false, false, errors.Wrap(err, "failed to filter topology dry-run annotation on modified object") } // Also run the function for the originalUnstructured to remove the managedField @@ -84,11 +115,11 @@ func dryRunSSAPatch(ctx context.Context, dryRunCtx *dryRunSSAPatchInput) (bool, // Please note that if other managers made changes to fields that we care about and thus ownership changed, // this would affect our managed fields as well and we would still detect it by diffing our managed fields. if err := cleanupManagedFieldsAndAnnotation(dryRunCtx.originalUnstructured); err != nil { - return false, false, errors.Wrap(err, "failed to filter topology dry-run annotation on originalUnstructured") + return false, false, errors.Wrap(err, "failed to filter topology dry-run annotation on original object") } // Drop the other fields which are not part of our intent. - filterObject(dryRunCtx.dryRunUnstructured, dryRunHelperOptions) + filterObject(dryRunCtx.modifiedUnstructured, dryRunHelperOptions) filterObject(dryRunCtx.originalUnstructured, dryRunHelperOptions) // Compare the output of dry run to the original object. @@ -96,12 +127,12 @@ func dryRunSSAPatch(ctx context.Context, dryRunCtx *dryRunSSAPatchInput) (bool, if err != nil { return false, false, err } - dryRunJSON, err := json.Marshal(dryRunCtx.dryRunUnstructured) + modifiedJSON, err := json.Marshal(dryRunCtx.modifiedUnstructured) if err != nil { return false, false, err } - rawDiff, err := jsonpatch.CreateMergePatch(originalJSON, dryRunJSON) + rawDiff, err := jsonpatch.CreateMergePatch(originalJSON, modifiedJSON) if err != nil { return false, false, err } @@ -119,7 +150,7 @@ func dryRunSSAPatch(ctx context.Context, dryRunCtx *dryRunSSAPatchInput) (bool, } // cleanupManagedFieldsAndAnnotation adjusts the obj to remove the topology.cluster.x-k8s.io/dry-run -// annotation as well as the field ownership reference in managedFields. It does +// and cluster.x-k8s.io/conversion-data annotations as well as the field ownership reference in managedFields. It does // also remove the timestamp of the managedField for `manager=capi-topology` because // it is expected to change due to the additional annotation. func cleanupManagedFieldsAndAnnotation(obj *unstructured.Unstructured) error { diff --git a/internal/controllers/topology/cluster/structuredmerge/serversidepathhelper.go b/internal/controllers/topology/cluster/structuredmerge/serversidepathhelper.go index a60203c43f62..679f719ec67a 100644 --- a/internal/controllers/topology/cluster/structuredmerge/serversidepathhelper.go +++ b/internal/controllers/topology/cluster/structuredmerge/serversidepathhelper.go @@ -96,7 +96,7 @@ func NewServerSidePatchHelper(ctx context.Context, original, modified client.Obj hasChanges, hasSpecChanges, err = dryRunSSAPatch(ctx, &dryRunSSAPatchInput{ client: c, originalUnstructured: originalUnstructured, - dryRunUnstructured: modifiedUnstructured.DeepCopy(), + modifiedUnstructured: modifiedUnstructured.DeepCopy(), helperOptions: helperOptions, }) if err != nil { diff --git a/internal/controllers/topology/cluster/structuredmerge/serversidepathhelper_test.go b/internal/controllers/topology/cluster/structuredmerge/serversidepathhelper_test.go index 9714baa5d0c9..b05fb2a4cc86 100644 --- a/internal/controllers/topology/cluster/structuredmerge/serversidepathhelper_test.go +++ b/internal/controllers/topology/cluster/structuredmerge/serversidepathhelper_test.go @@ -17,15 +17,30 @@ limitations under the License. package structuredmerge import ( + "context" "encoding/json" + "fmt" + "net" + "os" + "path/filepath" + "strconv" "testing" + "time" . "github.com/onsi/gomega" + admissionv1 "k8s.io/api/admissionregistration/v1" + corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/utils/pointer" "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/webhook" + "sigs.k8s.io/controller-runtime/pkg/webhook/admission" clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" + bootstrapv1 "sigs.k8s.io/cluster-api/bootstrap/kubeadm/api/v1beta1" "sigs.k8s.io/cluster-api/internal/test/builder" "sigs.k8s.io/cluster-api/util/patch" ) @@ -506,3 +521,285 @@ func getTopologyManagedFields(original client.Object) map[string]interface{} { } return r } + +// NOTE: This test ensures that ServerSideApply works as expected when new defaulting logic is introduced by a Cluster API update. +func TestServerSideApplyWithDefaulting(t *testing.T) { + g := NewWithT(t) + + // Create a namespace for running the test + ns, err := env.CreateNamespace(ctx, "ssa-defaulting") + g.Expect(err).ToNot(HaveOccurred()) + + // Setup webhook with the manager. + // Note: The webhooks is not active yet, as the MutatingWebhookConfiguration will be deployed later. + mutatingWebhookConfiguration, err := setupWebhookWithManager(ns) + g.Expect(err).ToNot(HaveOccurred()) + + // Calculate KubeadmConfigTemplate. + kct := &bootstrapv1.KubeadmConfigTemplate{ + ObjectMeta: metav1.ObjectMeta{ + Name: "kct", + Namespace: ns.Name, + }, + Spec: bootstrapv1.KubeadmConfigTemplateSpec{ + Template: bootstrapv1.KubeadmConfigTemplateResource{ + Spec: bootstrapv1.KubeadmConfigSpec{ + JoinConfiguration: &bootstrapv1.JoinConfiguration{ + NodeRegistration: bootstrapv1.NodeRegistrationOptions{ + KubeletExtraArgs: map[string]string{ + "eviction-hard": "nodefs.available<0%,nodefs.inodesFree<0%,imagefs.available<0%", + }, + }, + }, + }, + }, + }, + } + + // The test does the following. + // 1. Create KubeadmConfigTemplate + // 2. Activate the new defaulting logic via the webhook + // * This simulates the deployment of a new Cluster API version with new defaulting + // 3. Simulate defaulting on original and/or modified + // * defaultOriginal will add a label to the KubeadmConfigTemplate which will trigger defaulting + // * original is the KubeadmConfigTemplate referenced in a MachineDeployment of the Cluster topology + // * defaultModified will simulate that defaulting was run on the KubeadmConfigTemplate referenced in the ClusterClass + // * modified is the desired state calculated based on the KubeadmConfigTemplate referenced in the ClusterClass + // * We are testing through all permutations as we don't want to assume on which objects defaulting was run. + // 4. Check patch helper results + + // We have the following test cases: + // | original | modified | expect behavior | + // | | | no-op | + // | defaulted | | no-op | + // | | defaulted | no spec changes, only take ownership of defaulted fields | + // | defaulted | defaulted | no spec changes, only take ownership of defaulted fields | + tests := []struct { + name string + defaultOriginal bool + defaultModified bool + expectChanges bool + expectSpecChanges bool + expectFieldOwnership bool + }{ + { + name: "no-op if neither is defaulted", + defaultOriginal: false, + defaultModified: false, + // Dry run results: + // * original: field will be defaulted by the webhook, capi-topology doesn't get ownership. + // * modified: field will be defaulted by the webhook, capi-topology doesn't get ownership. + expectChanges: false, + expectSpecChanges: false, + expectFieldOwnership: false, + }, + { + name: "no-op if original is defaulted", + defaultOriginal: true, + defaultModified: false, + // Dry run results: + // * original: no defaulting in dry run, as field has already been defaulted before, capi-topology doesn't get ownership. + // * modified: field will be defaulted by the webhook, capi-topology doesn't get ownership. + expectChanges: false, + expectSpecChanges: false, + expectFieldOwnership: false, + }, + { + name: "no spec changes, only take ownership of defaulted fields if modified is defaulted", + defaultOriginal: false, + defaultModified: true, + // Dry run results: + // * original: field will be defaulted by the webhook, capi-topology doesn't get ownership. + // * original: no defaulting in dry run, as field has already been defaulted before, capi-topology does get ownership as we explicitly set the field. + // => capi-topology takes ownership during Patch + expectChanges: true, + expectSpecChanges: false, + expectFieldOwnership: true, + }, + { + name: "no spec changes, only take ownership of defaulted fields if both are defaulted", + defaultOriginal: true, + defaultModified: true, + // Dry run results: + // * original: no defaulting in dry run, as field has already been defaulted before, capi-topology doesn't get ownership. + // * original: no defaulting in dry run, as field has already been defaulted before, capi-topology does get ownership as we explicitly set the field. + // => capi-topology takes ownership during Patch + expectChanges: true, + expectSpecChanges: false, + expectFieldOwnership: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + g := NewWithT(t) + // Note: This is necessary because otherwise we could not create the webhook config + // in multiple test runs, because after the first test run it has a resourceVersion set. + mutatingWebhookConfiguration := mutatingWebhookConfiguration.DeepCopy() + + // Create the initial KubeadmConfigTemplate (with the old defaulting logic). + p0, err := NewServerSidePatchHelper(ctx, nil, kct.DeepCopy(), env.GetClient()) + g.Expect(err).ToNot(HaveOccurred()) + g.Expect(p0.HasChanges()).To(BeTrue()) + g.Expect(p0.HasSpecChanges()).To(BeTrue()) + g.Expect(p0.Patch(ctx)).To(Succeed()) + defer func() { + g.Expect(env.CleanupAndWait(ctx, kct.DeepCopy())).To(Succeed()) + }() + + // Enable the new defaulting logic (i.e. simulate the Cluster API update). + // The webhook will default the users field to `[{Name: "default-user"}]`. + g.Expect(env.Create(ctx, mutatingWebhookConfiguration)).To(Succeed()) + defer func() { + g.Expect(env.CleanupAndWait(ctx, mutatingWebhookConfiguration)).To(Succeed()) + }() + + // Run defaulting on the KubeadmConfigTemplate (triggered by an "external controller") + if tt.defaultOriginal { + patchKCT := kct.DeepCopy() + if patchKCT.Labels == nil { + patchKCT.Labels = map[string]string{} + } + patchKCT.Labels["trigger"] = "update" + g.Expect(env.Patch(ctx, patchKCT, client.MergeFrom(kct))).To(Succeed()) + // Ensure patchKCT was defaulted. + g.Expect(patchKCT.Spec.Template.Spec.Users).To(Equal([]bootstrapv1.User{{Name: "default-user"}})) + } + // Get original for the update. + original := kct.DeepCopy() + g.Expect(env.Get(ctx, client.ObjectKeyFromObject(original), original)) + + // Calculate modified for the update. + modified := kct.DeepCopy() + // Run defaulting on modified + // Note: We just default the modified / desired locally as we are not simulating + // an entire ClusterClass. Defaulting on the template of the ClusterClass would + // lead to the modified object having the defaults. + if tt.defaultModified { + defaultKubeadmConfigTemplate(modified) + } + + // Apply modified. + p0, err = NewServerSidePatchHelper(ctx, original, modified, env.GetClient()) + g.Expect(err).ToNot(HaveOccurred()) + g.Expect(p0.HasChanges()).To(Equal(tt.expectChanges)) + g.Expect(p0.HasSpecChanges()).To(Equal(tt.expectSpecChanges)) + g.Expect(p0.Patch(ctx)).To(Succeed()) + + // Verify field ownership + // Note: It might take a bit for the cache to be up-to-date. + g.Eventually(func(g Gomega) { + got := original.DeepCopy() + g.Expect(env.Get(ctx, client.ObjectKeyFromObject(got), got)) + + // topology controller should express opinions on spec.template.spec. + fieldV1 := getTopologyManagedFields(got) + g.Expect(fieldV1).ToNot(BeEmpty()) + g.Expect(fieldV1).To(HaveKey("f:spec")) + specFieldV1 := fieldV1["f:spec"].(map[string]interface{}) + g.Expect(specFieldV1).ToNot(BeEmpty()) + g.Expect(specFieldV1).To(HaveKey("f:template")) + specTemplateFieldV1 := specFieldV1["f:template"].(map[string]interface{}) + g.Expect(specTemplateFieldV1).ToNot(BeEmpty()) + g.Expect(specTemplateFieldV1).To(HaveKey("f:spec")) + + specTemplateSpecFieldV1 := specTemplateFieldV1["f:spec"].(map[string]interface{}) + if tt.expectFieldOwnership { + // topology controller should express opinions on spec.template.spec.users. + g.Expect(specTemplateSpecFieldV1).To(HaveKey("f:users")) + } else { + // topology controller should not express opinions on spec.template.spec.users. + g.Expect(specTemplateSpecFieldV1).ToNot(HaveKey("f:users")) + } + }, 2*time.Second).Should(Succeed()) + }) + } +} + +// setupWebhookWithManager configures the envtest manager / webhook server to serve the webhook. +// It also calculates and returns the corresponding MutatingWebhookConfiguration. +// Note: To activate the webhook, the MutatingWebhookConfiguration has to be deployed. +func setupWebhookWithManager(ns *corev1.Namespace) (*admissionv1.MutatingWebhookConfiguration, error) { + webhookServer := env.Manager.GetWebhookServer() + + // Calculate webhook host and path. + // Note: This is done the same way as in our envtest package. + webhookPath := fmt.Sprintf("/%s/ssa-defaulting-webhook", ns.Name) + webhookHost := "127.0.0.1" + if host := os.Getenv("CAPI_WEBHOOK_HOSTNAME"); host != "" { + webhookHost = host + } + + // Serve KubeadmConfigTemplateTestDefaulter on the webhook server. + // Note: This should only ever be called once with the same path, otherwise we get a panic. + webhookServer.Register(webhookPath, + admission.WithCustomDefaulter(&bootstrapv1.KubeadmConfigTemplate{}, &KubeadmConfigTemplateTestDefaulter{})) + + // Calculate the MutatingWebhookConfiguration + caBundle, err := os.ReadFile(filepath.Join(webhookServer.CertDir, webhookServer.CertName)) + if err != nil { + return nil, err + } + + sideEffectNone := admissionv1.SideEffectClassNone + webhookConfig := &admissionv1.MutatingWebhookConfiguration{ + ObjectMeta: metav1.ObjectMeta{ + Name: ns.Name + "-webhook-config", + }, + Webhooks: []admissionv1.MutatingWebhook{ + { + Name: ns.Name + ".kubeadmconfigtemplate.bootstrap.cluster.x-k8s.io", + ClientConfig: admissionv1.WebhookClientConfig{ + URL: pointer.String(fmt.Sprintf("https://%s%s", net.JoinHostPort(webhookHost, strconv.Itoa(webhookServer.Port)), webhookPath)), + CABundle: caBundle, + }, + Rules: []admissionv1.RuleWithOperations{ + { + Operations: []admissionv1.OperationType{ + admissionv1.Create, + admissionv1.Update, + }, + Rule: admissionv1.Rule{ + APIGroups: []string{bootstrapv1.GroupVersion.Group}, + APIVersions: []string{bootstrapv1.GroupVersion.Version}, + Resources: []string{"kubeadmconfigtemplates"}, + }, + }, + }, + NamespaceSelector: &metav1.LabelSelector{ + MatchLabels: map[string]string{ + corev1.LabelMetadataName: ns.Name, + }, + }, + AdmissionReviewVersions: []string{"v1"}, + SideEffects: &sideEffectNone, + }, + }, + } + return webhookConfig, nil +} + +var _ webhook.CustomDefaulter = &KubeadmConfigTemplateTestDefaulter{} + +type KubeadmConfigTemplateTestDefaulter struct { +} + +func (d KubeadmConfigTemplateTestDefaulter) Default(_ context.Context, obj runtime.Object) error { + kct, ok := obj.(*bootstrapv1.KubeadmConfigTemplate) + if !ok { + return apierrors.NewBadRequest(fmt.Sprintf("expected a Cluster but got a %T", obj)) + } + + defaultKubeadmConfigTemplate(kct) + return nil +} + +func defaultKubeadmConfigTemplate(kct *bootstrapv1.KubeadmConfigTemplate) { + if len(kct.Spec.Template.Spec.Users) == 0 { + kct.Spec.Template.Spec.Users = []bootstrapv1.User{ + { + Name: "default-user", + }, + } + } +} diff --git a/test/e2e/clusterctl_upgrade.go b/test/e2e/clusterctl_upgrade.go index ec489539467e..aee0bf0f5ed8 100644 --- a/test/e2e/clusterctl_upgrade.go +++ b/test/e2e/clusterctl_upgrade.go @@ -435,6 +435,21 @@ func ClusterctlUpgradeSpec(ctx context.Context, inputGetter func() ClusterctlUpg input.PostUpgrade(managementClusterProxy) } + // After the upgrade check that there were no unexpected rollouts. + log.Logf("Verify there are no unexpected rollouts") + Consistently(func() bool { + postUpgradeMachineList := &unstructured.UnstructuredList{} + postUpgradeMachineList.SetGroupVersionKind(clusterv1.GroupVersion.WithKind("MachineList")) + err = managementClusterProxy.GetClient().List( + ctx, + postUpgradeMachineList, + client.InNamespace(testNamespace.Name), + client.MatchingLabels{clusterv1.ClusterNameLabel: workLoadClusterName}, + ) + Expect(err).NotTo(HaveOccurred()) + return matchUnstructuredLists(preUpgradeMachineList, postUpgradeMachineList) + }, "3m", "30s").Should(BeTrue(), "Machines should remain the same after the upgrade") + // After upgrading we are sure the version is the latest version of the API, // so it is possible to use the standard helpers workloadCluster := framework.GetClusterByName(ctx, framework.GetClusterByNameInput{ @@ -443,26 +458,6 @@ func ClusterctlUpgradeSpec(ctx context.Context, inputGetter func() ClusterctlUpg Name: workLoadClusterName, }) - // TODO(sbueringer) Only run this test for non-ClusterClass Clusters. - // Currently the Cluster topology controller triggers a rollout after upgrade. - // We are actively working on fixing it. - if workloadCluster.Spec.Topology == nil { - // After the upgrade check that there were no unexpected rollouts. - log.Logf("Verify there are no unexpected rollouts") - Consistently(func() bool { - postUpgradeMachineList := &unstructured.UnstructuredList{} - postUpgradeMachineList.SetGroupVersionKind(clusterv1.GroupVersion.WithKind("MachineList")) - err = managementClusterProxy.GetClient().List( - ctx, - postUpgradeMachineList, - client.InNamespace(testNamespace.Name), - client.MatchingLabels{clusterv1.ClusterNameLabel: workLoadClusterName}, - ) - Expect(err).NotTo(HaveOccurred()) - return matchUnstructuredLists(preUpgradeMachineList, postUpgradeMachineList) - }, "3m", "30s").Should(BeTrue(), "Machines should remain the same after the upgrade") - } - if workloadCluster.Spec.Topology != nil { // Cluster is using ClusterClass, scale up via topology. framework.ScaleAndWaitMachineDeploymentTopology(ctx, framework.ScaleAndWaitMachineDeploymentTopologyInput{