fix: stop unexpected rollouts (#324)
When making any node group change, we were previously
re-applying the entire set of cluster resources.  With
this change, we instead make only the small change within
the node group itself, which should avoid unexpected full
rollouts of node groups when changing autoscaling min/max
or resizing a cluster.

Signed-off-by: Mohammed Naser <[email protected]>
mnaser authored Mar 17, 2024
1 parent be7d18c commit 78f8a02
Showing 7 changed files with 486 additions and 62 deletions.
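
In essence, the driver now patches only the matching `machineDeployments` entry in the Cluster topology instead of re-applying every cluster resource. The sketch below is illustrative only, using names that appear in the diff further down (`mutate_machine_deployment`, `apply_cluster_from_magnum_cluster`); the wrapper function and its exact signature are assumptions, not verbatim driver code.

```python
# Illustrative sketch only -- not verbatim driver code.
from magnum_cluster_api import objects, resources


def scale_node_group(context, k8s_api, cluster, nodegroup):
    # Old behaviour: re-apply *all* cluster resources, which could trigger
    # an unexpected full rollout of the node groups:
    #
    #   resources.apply_cluster_from_magnum_cluster(
    #       context, k8s_api, cluster, skip_auto_scaling_release=True
    #   )
    #
    # New behaviour: mutate only the machineDeployments entry that belongs
    # to this node group, then push that single change.
    cluster_resource = objects.Cluster.objects(
        k8s_api, namespace="magnum-system"
    ).get(name=cluster.stack_id)

    workers = cluster_resource.obj["spec"]["topology"]["workers"]["machineDeployments"]
    for index, machine_deployment in enumerate(workers):
        if machine_deployment["name"] == nodegroup.name:
            workers[index] = resources.mutate_machine_deployment(
                context, cluster, nodegroup, machine_deployment
            )
            break

    cluster_resource.update()
```
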
12 changes: 12 additions & 0 deletions docs/developer/cluster-topology.md
@@ -0,0 +1,12 @@
# Cluster Topology

The Cluster API driver for Magnum makes use of the Cluster topology feature of the Cluster API project. This allows it to delegate all of the work of building resources such as the `OpenStackCluster`, the `MachineDeployments` and everything else to be managed entirely by Cluster API, instead of the driver creating all of these resources itself.
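
For reference, the portion of the `Cluster` resource that the driver manipulates is the `spec.topology` section. A simplified, assumed shape of that section, expressed as the Python dictionary the driver edits, is shown below; the concrete values are placeholders.

```python
# Assumed, simplified shape of spec.topology (placeholder values only):
cluster_spec_topology = {
    "class": "magnum-v0.13.0",  # versioned ClusterClass, see below
    "version": "v1.27.4",       # Kubernetes version of the cluster
    "workers": {
        "machineDeployments": [
            {
                "class": "default-worker",
                "name": "default-worker",
                "replicas": 3,
                "metadata": {"annotations": {}, "labels": {}},
            },
        ],
    },
}
```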

In order to do this, the driver creates a [`ClusterClass`](https://cluster-api.sigs.k8s.io/tasks/experimental-features/cluster-class/write-clusterclass) resource named `magnum-v{VERSION}`, where `{VERSION}` is the current version of the driver, for the following reasons:

- This allows us to have multiple versions of the `ClusterClass` side by side, since it is an immutable resource.
- This prevents a rollout of existing clusters should the underlying `ClusterClass` change.

It's important to note that there is only _one_ scenario where the `spec.topology.class` for a given `Cluster` will be modified: when a cluster upgrade is performed. This is because the user expects a rolling restart to occur when a cluster upgrade is requested. No other action should be allowed to change the `spec.topology.class` of a `Cluster`.

For users, it's important to keep in mind that if they want to make use of a feature available in a newer `ClusterClass`, they can simply perform an upgrade within Magnum to the same cluster template. This forces an update of the `spec.topology.class`, which may then naturally cause a full rollout to occur.
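
As a sketch of the upgrade path above, changing `spec.topology.class` is the only mutation the driver is expected to make that can trigger a rollout. The helper below is hypothetical and only illustrates the `magnum-v{VERSION}` naming convention.

```python
# Hypothetical helper, for illustration only.
def point_cluster_at_new_class(cluster_resource, driver_version: str) -> None:
    # Upgrades are the only operation allowed to change spec.topology.class;
    # the class name follows the magnum-v{VERSION} convention.
    cluster_resource.obj["spec"]["topology"]["class"] = f"magnum-v{driver_version}"
    cluster_resource.update()
```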
4 changes: 3 additions & 1 deletion flake.nix
@@ -15,9 +15,11 @@
{
devShell = pkgs.mkShell
{
# LD_LIBRARY_PATH = "${pkgs.stdenv.cc.cc.lib}/lib";
LD_LIBRARY_PATH = "${pkgs.stdenv.cc.cc.lib}/lib";

buildInputs = with pkgs; [
bashInteractive
glibcLocales
poetry
];
};
121 changes: 95 additions & 26 deletions magnum_cluster_api/driver.py
@@ -13,6 +13,8 @@
# under the License.


from __future__ import annotations

import keystoneauth1
from magnum import objects as magnum_objects
from magnum.conductor import scale_manager
@@ -239,9 +241,6 @@ def resize_cluster(
):
utils.validate_cluster(context, cluster)

if nodegroup is None:
nodegroup = cluster.default_ng_worker

if nodes_to_remove:
machines = objects.Machine.objects(self.k8s_api).filter(
namespace="magnum-system",
@@ -260,12 +259,7 @@
] = "yes"
machine.update()

nodegroup.node_count = node_count
nodegroup.save()

resources.apply_cluster_from_magnum_cluster(
context, self.k8s_api, cluster, skip_auto_scaling_release=True
)
self._update_nodegroup(context, cluster, nodegroup)

@cluster_lock_wrapper
def upgrade_cluster(
@@ -341,10 +335,29 @@ def create_nodegroup(
nodegroup: magnum_objects.NodeGroup,
):
utils.validate_nodegroup(nodegroup, context)
resources.apply_cluster_from_magnum_cluster(
context, self.k8s_api, cluster, skip_auto_scaling_release=True

cluster_resource: objects.Cluster = objects.Cluster.objects(
self.k8s_api, namespace="magnum-system"
).get(name=cluster.stack_id)

cluster_resource.obj["spec"]["topology"]["workers"][
"machineDeployments"
].append(resources.mutate_machine_deployment(context, cluster, nodegroup))

current_generation = resources.Cluster(
context, self.k8s_api, cluster
).get_observed_generation()
cluster_resource.update()
self.wait_capi_cluster_reconciliation_start(
context, cluster, current_generation
)

nodegroup.status = fields.ClusterStatus.CREATE_IN_PROGRESS
nodegroup.save()

cluster.status = fields.ClusterStatus.UPDATE_IN_PROGRESS
cluster.save()

def update_nodegroup_status(
self,
context,
@@ -402,18 +415,53 @@ def update_nodegroup(
cluster: magnum_objects.Cluster,
nodegroup: magnum_objects.NodeGroup,
):
# TODO
self._update_nodegroup(context, cluster, nodegroup)

# NOTE(okozachenko1203): First we save the nodegroup status because update_cluster_status()
# could be finished before update_nodegroup().
nodegroup.save()
def _update_nodegroup(
self,
context,
cluster: magnum_objects.Cluster,
nodegroup: magnum_objects.NodeGroup,
):
utils.validate_nodegroup(nodegroup, context)
resources.apply_cluster_from_magnum_cluster(
context, self.k8s_api, cluster, skip_auto_scaling_release=True

cluster_resource: objects.Cluster = objects.Cluster.objects(
self.k8s_api, namespace="magnum-system"
).get(name=cluster.stack_id)

machine_deployment_index = None
for i, machine_deployment in enumerate(
cluster_resource.obj["spec"]["topology"]["workers"]["machineDeployments"]
):
if machine_deployment["name"] == nodegroup.name:
machine_deployment_index = i
break

if machine_deployment_index is not None:
machine_deployment = cluster_resource.obj["spec"]["topology"]["workers"][
"machineDeployments"
][machine_deployment_index]

cluster_resource.obj["spec"]["topology"]["workers"]["machineDeployments"][
machine_deployment_index
] = resources.mutate_machine_deployment(
context,
cluster,
nodegroup,
machine_deployment,
)

current_generation = resources.Cluster(
context, self.k8s_api, cluster
).get_observed_generation()
cluster_resource.update()
self.wait_capi_cluster_reconciliation_start(
context, cluster, current_generation
)
# NOTE(okozachenko1203): We set the cluster status as UPDATE_IN_PROGRESS again at the end because
# update_cluster_status() could be finished and cluster status has been set as
# UPDATE_COMPLETE before nodegroup_conductor.Handler.nodegroup_update finished.

nodegroup.status = fields.ClusterStatus.UPDATE_IN_PROGRESS
nodegroup.save()

cluster.status = fields.ClusterStatus.UPDATE_IN_PROGRESS
cluster.save()

@@ -424,15 +472,36 @@ def delete_nodegroup(
cluster: magnum_objects.Cluster,
nodegroup: magnum_objects.NodeGroup,
):
cluster_resource: objects.Cluster = objects.Cluster.objects(
self.k8s_api, namespace="magnum-system"
).get(name=cluster.stack_id)

machine_deployment_index = None
for i, machine_deployment in enumerate(
cluster_resource.obj["spec"]["topology"]["workers"]["machineDeployments"]
):
if machine_deployment["name"] == nodegroup.name:
machine_deployment_index = i
break

if machine_deployment_index is not None:
del cluster_resource.obj["spec"]["topology"]["workers"][
"machineDeployments"
][machine_deployment_index]

current_generation = resources.Cluster(
context, self.k8s_api, cluster
).get_observed_generation()
cluster_resource.update()
self.wait_capi_cluster_reconciliation_start(
context, cluster, current_generation
)

nodegroup.status = fields.ClusterStatus.DELETE_IN_PROGRESS
nodegroup.save()

resources.apply_cluster_from_magnum_cluster(
context,
self.k8s_api,
cluster,
skip_auto_scaling_release=True,
)
cluster.status = fields.ClusterStatus.UPDATE_IN_PROGRESS
cluster.save()

@cluster_lock_wrapper
def get_monitor(self, context, cluster: magnum_objects.Cluster):
109 changes: 74 additions & 35 deletions magnum_cluster_api/resources.py
@@ -1993,36 +1993,68 @@ def create_cluster_class(
ClusterClass(api).apply()


def generate_machine_deployments_for_cluster(
context: context.RequestContext, cluster: objects.Cluster
) -> list:
def mutate_machine_deployment(
context: context.RequestContext,
cluster: objects.Cluster,
node_group: magnum_objects.NodeGroup,
machine_deployment: dict = {},
):
"""
This function either updates machine deployment fields that will not
cause a rolling update, or returns a new machine deployment if none
is provided.
"""

auto_scaling_enabled = utils.get_auto_scaling_enabled(cluster)

machine_deployments = []
for ng in cluster.nodegroups:
if ng.role == "master" or ng.status.startswith("DELETE"):
continue
machine_deployment.setdefault(
"metadata",
{
"annotations": {},
"labels": {},
},
)

# Node labels
machine_deployment["metadata"]["labels"] = {
f"node-role.kubernetes.io/{node_group.role}": "",
"node.cluster.x-k8s.io/nodegroup": node_group.name,
}

# Replicas (or min/max if auto-scaling is enabled)
if auto_scaling_enabled:
machine_deployment["replicas"] = None
machine_deployment["metadata"]["annotations"] = {
AUTOSCALE_ANNOTATION_MIN: str(
utils.get_node_group_min_node_count(node_group)
),
AUTOSCALE_ANNOTATION_MAX: str(
utils.get_node_group_max_node_count(context, node_group)
),
}
else:
machine_deployment["replicas"] = node_group.node_count
machine_deployment["metadata"]["annotations"] = {}

machine_deployment = {
# Fixes
machine_deployment["nodeVolumeDetachTimeout"] = (
CLUSTER_CLASS_NODE_VOLUME_DETACH_TIMEOUT
)

# Anything beyond this point will *NOT* be changed in the machine deployment
# for update operations (i.e. if the machine deployment already exists).
if machine_deployment.get("name") == node_group.name:
return machine_deployment

# At this point, this is all code that will be added for brand new machine
# deployments. We can bring any of this code into the above block if we
# want to change it for existing machine deployments.
machine_deployment.update(
{
"class": "default-worker",
"nodeVolumeDetachTimeout": CLUSTER_CLASS_NODE_VOLUME_DETACH_TIMEOUT,
"name": ng.name,
"metadata": {
"annotations": (
{
AUTOSCALE_ANNOTATION_MIN: f"{utils.get_node_group_min_node_count(ng)}", # noqa: E501
AUTOSCALE_ANNOTATION_MAX: f"{utils.get_node_group_max_node_count(context, ng)}", # noqa: E501
}
if auto_scaling_enabled
else {}
),
"labels": {
f"node-role.kubernetes.io/{ng.role}": "",
"node.cluster.x-k8s.io/nodegroup": ng.name,
},
},
"name": node_group.name,
"failureDomain": utils.get_node_group_label(
context, ng, "availability_zone", ""
context, node_group, "availability_zone", ""
),
"machineHealthCheck": {
"enable": utils.get_cluster_label_as_bool(
@@ -2036,45 +2068,52 @@ def generate_machine_deployments_for_cluster(
"value": {
"size": utils.get_node_group_label_as_int(
context,
ng,
node_group,
"boot_volume_size",
CONF.cinder.default_boot_volume_size,
),
"type": utils.get_node_group_label(
context,
ng,
node_group,
"boot_volume_type",
cinder.get_default_boot_volume_type(context),
),
},
},
{
"name": "flavor",
"value": ng.flavor_id,
"value": node_group.flavor_id,
},
{
"name": "imageRepository",
"value": utils.get_node_group_label(
context,
ng,
node_group,
"container_infra_prefix",
"",
),
},
{
"name": "imageUUID",
"value": utils.get_image_uuid(ng.image_id, context),
"value": utils.get_image_uuid(node_group.image_id, context),
},
],
},
}
)

return machine_deployment

# NOTE(mnaser): In order for auto-scaler and cluster class reconcilers
# to not fight each other, we only set replicas if the
# auto-scaler is disabled.
if not auto_scaling_enabled:
machine_deployment["replicas"] = ng.node_count

def generate_machine_deployments_for_cluster(
context: context.RequestContext, cluster: objects.Cluster
) -> list:
machine_deployments = []
for ng in cluster.nodegroups:
if ng.role == "master" or ng.status.startswith("DELETE"):
continue

machine_deployment = mutate_machine_deployment(context, cluster, ng)
machine_deployments.append(machine_deployment)

return machine_deployments
25 changes: 25 additions & 0 deletions magnum_cluster_api/tests/unit/conftest.py
@@ -15,6 +15,8 @@
import pytest
from magnum.common import context as magnum_context

from magnum_cluster_api import driver


@pytest.fixture
def context():
@@ -26,3 +28,26 @@ def context():
user_id="fake_user",
is_admin=False,
)


@pytest.fixture(scope="session")
def mock_pykube(session_mocker):
session_mocker.patch("pykube.KubeConfig")
session_mocker.patch("pykube.HTTPClient")


@pytest.fixture(scope="session")
def mock_cluster_lock(session_mocker):
session_mocker.patch("kubernetes.config.load_config")
session_mocker.patch("magnum_cluster_api.sync.ClusterLock.acquire")
session_mocker.patch("magnum_cluster_api.sync.ClusterLock.release")


@pytest.fixture(scope="session")
def mock_validate_nodegroup(session_mocker):
session_mocker.patch("magnum_cluster_api.utils.validate_nodegroup")


@pytest.fixture()
def ubuntu_driver(mock_cluster_lock, mock_pykube):
yield driver.UbuntuDriver()
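
For illustration, a test could combine these fixtures roughly as follows; this is a hypothetical sketch, not a test added by this commit.

```python
# Hypothetical usage sketch: the session-scoped mocks above allow the driver
# to be instantiated without a real Kubernetes API or cluster lock backend.
def test_ubuntu_driver_is_constructed(ubuntu_driver, context):
    assert ubuntu_driver is not None
```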