diff --git a/docs/book/src/topics/managedcluster.md b/docs/book/src/topics/managedcluster.md
index 618deac4174..2771c1b433d 100644
--- a/docs/book/src/topics/managedcluster.md
+++ b/docs/book/src/topics/managedcluster.md
@@ -626,3 +626,57 @@ Some notes about how this works under the hood:
 - CAPZ will fetch the kubeconfig for the AKS cluster and store it in a secret named `${CLUSTER_NAME}-kubeconfig` in the management cluster. That secret is then used for discovery by the `KubeadmConfig` resource.
 - You can customize the `MachinePool`, `AzureMachinePool`, and `KubeadmConfig` resources to your liking. The example above is just a starting point. Note that the key configurations to keep are in the `KubeadmConfig` resource, namely the `files`, `joinConfiguration`, and `preKubeadmCommands` sections.
 - The `KubeadmConfig` resource will be used to generate a `kubeadm join` command that will be executed on each node in the VMSS. It uses the cluster kubeconfig for discovery. The `kubeadm init phase upload-config all` is run as a preKubeadmCommand to ensure that the kubeadm and kubelet configurations are uploaded to a ConfigMap. This step would normally be done by the `kubeadm init` command, but since we're not running `kubeadm init` we need to do it manually.
+
+## Adopting Existing AKS Clusters
+
+
+
+CAPZ can adopt some AKS clusters created by other means under its management. This works by crafting CAPI and
+CAPZ manifests which describe the existing cluster and creating those resources on the CAPI management
+cluster. This approach is limited to clusters which can be described by the CAPZ API, which imposes the
+following constraints:
+
+- the cluster operates within a single Virtual Network and Subnet
+- the cluster's Virtual Network exists outside of the AKS-managed `MC_*` resource group
+- the cluster's Virtual Network and Subnet are not shared with any resources outside the context of this cluster
+
+To ensure CAPZ does not introduce any unwarranted changes while adopting an existing cluster, carefully review
+the [entire AzureManagedControlPlane spec](/reference/v1beta1-api#infrastructure.cluster.x-k8s.io/v1beta1.AzureManagedControlPlaneSpec)
+and specify _every_ field in the CAPZ resource. CAPZ's webhooks apply defaults to many fields, and those
+defaults may not match the existing cluster.
+
+AKS features that are not represented in the CAPZ API, such as those from a newer AKS API version than the
+one CAPZ uses, do not need to be specified in the CAPZ resources; they will remain configured the way they
+are. CAPZ will not be able to manage that configuration, but it also will not modify any settings it does
+not know about.
+
+By default, CAPZ will not make any changes to or delete any pre-existing Resource Group, Virtual Network, or
+Subnet resources. To opt in to CAPZ management of those resources, tag them with the following before
+creating the CAPZ resources: `sigs.k8s.io_cluster-api-provider-azure_cluster_<cluster-name>: owned`.
+Managed Cluster and Agent Pool resources do not need this tag in order to be adopted.
+
+After applying the CAPI and CAPZ resources for the cluster, other means of managing the cluster should be
+disabled, or at least carefully monitored, to avoid ongoing conflicts with CAPZ's reconciliation process.
+
+### Pitfalls
+
+The checklist below describes some specific pieces of configuration that deserve particularly careful attention; it is adapted from https://gist.github.com/mtougeron/1e5d7a30df396cd4728a26b2555e0ef0#file-capz-md.
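+
+For reference while reading the checklist, here is a minimal sketch of the kind of adoption manifests
+described above. The resource names, CIDRs, and subscription ID are placeholders, the
+`AzureManagedControlPlane` spec is intentionally abbreviated (in practice, specify every field as described
+above), and the `MachinePool`/`AzureManagedMachinePool` pair needed for each agent pool is omitted:
+
+```yaml
+apiVersion: cluster.x-k8s.io/v1beta1
+kind: Cluster
+metadata:
+  name: my-existing-aks            # must match the AKS cluster name
+spec:
+  clusterNetwork:
+    services:
+      cidrBlocks: ["10.0.0.0/16"]  # must match the AKS service CIDR
+  controlPlaneRef:
+    apiVersion: infrastructure.cluster.x-k8s.io/v1beta1
+    kind: AzureManagedControlPlane
+    name: my-existing-aks
+  infrastructureRef:
+    apiVersion: infrastructure.cluster.x-k8s.io/v1beta1
+    kind: AzureManagedCluster
+    name: my-existing-aks
+---
+apiVersion: infrastructure.cluster.x-k8s.io/v1beta1
+kind: AzureManagedControlPlane
+metadata:
+  name: my-existing-aks            # must match the AKS cluster name
+spec:
+  location: eastus
+  resourceGroupName: my-existing-rg
+  subscriptionID: 00000000-0000-0000-0000-000000000000
+  version: v1.28.5                 # must match the AKS Kubernetes version
+  dnsServiceIP: 10.0.0.10          # must match what is set in AKS
+  sshPublicKey: <base64-encoded key matching the AKS cluster>
+  virtualNetwork:                  # must match the existing VNet and Subnet
+    resourceGroup: my-existing-vnet-rg
+    name: my-existing-vnet
+    cidrBlock: 10.1.0.0/16
+    subnet:
+      name: my-existing-subnet
+      cidrBlock: 10.1.0.0/24
+---
+apiVersion: infrastructure.cluster.x-k8s.io/v1beta1
+kind: AzureManagedCluster
+metadata:
+  name: my-existing-aks
+```
+
+Once every field has been filled in to match the existing cluster, apply these resources to the management
+cluster with `kubectl apply`.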
+
+- Make sure `AzureManagedControlPlane.metadata.name` matches the AKS cluster name
+- Set the `AzureManagedControlPlane.spec.virtualNetwork` fields to match your existing VNet
+- Make sure the `AzureManagedControlPlane.spec.sshPublicKey` matches what was set on the AKS cluster, including any newlines included in the base64 encoding; a mismatch here is easy to miss
+  - NOTE: This is a required field in CAPZ. If you don't know what public key was used, you can _change_ or _set_ it via the Azure CLI before attempting to import the cluster.
+- Make sure the `Cluster.spec.clusterNetwork` settings match what you are using in AKS
+- Make sure the `AzureManagedControlPlane.spec.dnsServiceIP` matches what is set in AKS
+- Set the tag `sigs.k8s.io_cluster-api-provider-azure_cluster_<cluster-name>` = `owned` on the AKS cluster
+- Set the tag `sigs.k8s.io_cluster-api-provider-azure_role` = `common` on the AKS cluster
+
+NOTE: Several fields, such as `networkPlugin`, cannot be set by CAPZ during reconciliation if they were not set on the AKS cluster at creation time, because AKS does not allow them to be changed after creation. If a field was set at creation time, CAPZ will be able to successfully change/manage it.
diff --git a/test/e2e/aks_adopt.go b/test/e2e/aks_adopt.go
new file mode 100644
index 00000000000..dd2f9cb231c
--- /dev/null
+++ b/test/e2e/aks_adopt.go
@@ -0,0 +1,154 @@
+//go:build e2e
+// +build e2e
+
+/*
+Copyright 2024 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package e2e
+
+import (
+    "context"
+
+    . "github.com/onsi/gomega"
+    apierrors "k8s.io/apimachinery/pkg/api/errors"
+    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+    infrav1 "sigs.k8s.io/cluster-api-provider-azure/api/v1beta1"
+    clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
+    clusterctlv1 "sigs.k8s.io/cluster-api/cmd/clusterctl/api/v1alpha3"
+    expv1 "sigs.k8s.io/cluster-api/exp/api/v1beta1"
+    "sigs.k8s.io/cluster-api/test/framework/clusterctl"
+    "sigs.k8s.io/controller-runtime/pkg/client"
+)
+
+type AKSAdoptSpecInput struct {
+    ApplyInput   clusterctl.ApplyClusterTemplateAndWaitInput
+    ApplyResult  *clusterctl.ApplyClusterTemplateAndWaitResult
+    Cluster      *clusterv1.Cluster
+    MachinePools []*expv1.MachinePool
+}
+
+// AKSAdoptSpec tests adopting an existing AKS cluster into management by CAPZ. It first relies on a CAPZ AKS
+// cluster having already been created. Then, it will orphan that cluster such that the CAPI and CAPZ
+// resources are deleted but the Azure resources remain. Finally, it applies the cluster template again and
+// waits for the cluster to become ready.
+func AKSAdoptSpec(ctx context.Context, inputGetter func() AKSAdoptSpecInput) {
+    input := inputGetter()
+
+    mgmtClient := bootstrapClusterProxy.GetClient()
+    Expect(mgmtClient).NotTo(BeNil())
+
+    updateResource := []any{"30s", "5s"}
+
+    waitForNoBlockMove := func(obj client.Object) {
+        waitForBlockMoveGone := []any{"30s", "5s"}
+        Eventually(func(g Gomega) {
+            err := mgmtClient.Get(ctx, client.ObjectKeyFromObject(obj), obj)
+            g.Expect(err).NotTo(HaveOccurred())
+            g.Expect(obj.GetAnnotations()).NotTo(HaveKey(clusterctlv1.BlockMoveAnnotation))
+        }, waitForBlockMoveGone...).Should(Succeed())
+    }
+
+    removeFinalizers := func(obj client.Object) {
+        Eventually(func(g Gomega) {
+            err := mgmtClient.Get(ctx, client.ObjectKeyFromObject(obj), obj)
+            g.Expect(err).NotTo(HaveOccurred())
+            obj.SetFinalizers([]string{})
+            err = mgmtClient.Update(ctx, obj)
+            g.Expect(err).NotTo(HaveOccurred())
+        }, updateResource...).Should(Succeed())
+    }
+
+    waitForImmediateDelete := []any{"30s", "5s"}
+    beginDelete := func(obj client.Object) {
+        Eventually(func(g Gomega) {
+            err := mgmtClient.Delete(ctx, obj)
+            g.Expect(err).NotTo(HaveOccurred())
+        }, updateResource...).Should(Succeed())
+    }
+    shouldNotExist := func(obj client.Object) {
+        waitForGone := []any{"30s", "5s"}
+        Eventually(func(g Gomega) {
+            err := mgmtClient.Get(ctx, client.ObjectKeyFromObject(obj), obj)
+            g.Expect(apierrors.IsNotFound(err)).To(BeTrue())
+        }, waitForGone...).Should(Succeed())
+    }
+    deleteAndWait := func(obj client.Object) {
+        Eventually(func(g Gomega) {
+            err := mgmtClient.Delete(ctx, obj)
+            g.Expect(apierrors.IsNotFound(err)).To(BeTrue())
+        }, waitForImmediateDelete...).Should(Succeed())
+    }
+
+    cluster := input.Cluster
+    Eventually(func(g Gomega) {
+        err := mgmtClient.Get(ctx, client.ObjectKeyFromObject(cluster), cluster)
+        g.Expect(err).NotTo(HaveOccurred())
+        cluster.Spec.Paused = true
+        err = mgmtClient.Update(ctx, cluster)
+        g.Expect(err).NotTo(HaveOccurred())
+    }, updateResource...).Should(Succeed())
+
+    // wait for the pause to take effect before deleting anything
+    amcp := &infrav1.AzureManagedControlPlane{
+        ObjectMeta: metav1.ObjectMeta{
+            Namespace: cluster.Spec.ControlPlaneRef.Namespace,
+            Name:      cluster.Spec.ControlPlaneRef.Name,
+        },
+    }
+    waitForNoBlockMove(amcp)
+    for _, mp := range input.MachinePools {
+        ammp := &infrav1.AzureManagedMachinePool{
+            ObjectMeta: metav1.ObjectMeta{
+                Namespace: mp.Spec.Template.Spec.InfrastructureRef.Namespace,
+                Name:      mp.Spec.Template.Spec.InfrastructureRef.Name,
+            },
+        }
+        waitForNoBlockMove(ammp)
+    }
+
+    beginDelete(cluster)
+
+    for _, mp := range input.MachinePools {
+        beginDelete(mp)
+
+        ammp := &infrav1.AzureManagedMachinePool{
+            ObjectMeta: metav1.ObjectMeta{
+                Namespace: mp.Spec.Template.Spec.InfrastructureRef.Namespace,
+                Name:      mp.Spec.Template.Spec.InfrastructureRef.Name,
+            },
+        }
+        removeFinalizers(ammp)
+        deleteAndWait(ammp)
+
+        removeFinalizers(mp)
+        shouldNotExist(mp)
+    }
+
+    removeFinalizers(amcp)
+    deleteAndWait(amcp)
+    // AzureManagedCluster never gets a finalizer
+    deleteAndWait(&infrav1.AzureManagedCluster{
+        ObjectMeta: metav1.ObjectMeta{
+            Namespace: cluster.Spec.InfrastructureRef.Namespace,
+            Name:      cluster.Spec.InfrastructureRef.Name,
+        },
+    })
+
+    removeFinalizers(cluster)
+    shouldNotExist(cluster)
+
+    clusterctl.ApplyClusterTemplateAndWait(ctx, input.ApplyInput, input.ApplyResult)
+}
diff --git a/test/e2e/azure_test.go b/test/e2e/azure_test.go
index bc3d79bd71e..29ae75dd50f 100644
--- a/test/e2e/azure_test.go
+++ b/test/e2e/azure_test.go
@@ -699,7 +699,7 @@ var _ = Describe("Workload cluster creation", func() {
 			Byf("Upgrading to k8s version %s", kubernetesVersion)
 			Expect(err).NotTo(HaveOccurred())
 
-			clusterctl.ApplyClusterTemplateAndWait(ctx, createApplyClusterTemplateInput(
+			clusterTemplate := createApplyClusterTemplateInput(
 				specName,
 				withFlavor("aks"),
 				withAzureCNIv1Manifest(e2eConfig.GetVariable(AzureCNIv1Manifest)),
@@ -714,7 +714,22 @@ var _ = Describe("Workload cluster creation", func() {
 					WaitForControlPlaneInitialized:   WaitForAKSControlPlaneInitialized,
 					WaitForControlPlaneMachinesReady: WaitForAKSControlPlaneReady,
 				}),
-			), result)
+			)
+
+			clusterctl.ApplyClusterTemplateAndWait(ctx, clusterTemplate, result)
+
+			// This test should be first to make sure that the template re-applied here matches the current
+			// state of the cluster exactly.
+			By("orphaning and adopting the cluster", func() {
+				AKSAdoptSpec(ctx, func() AKSAdoptSpecInput {
+					return AKSAdoptSpecInput{
+						ApplyInput:   clusterTemplate,
+						ApplyResult:  result,
+						Cluster:      result.Cluster,
+						MachinePools: result.MachinePools,
+					}
+				})
+			})
+
 			By("adding an AKS marketplace extension", func() {
 				AKSMarketplaceExtensionSpec(ctx, func() AKSMarketplaceExtensionSpecInput {