From 32770e23a7f91bd59c50ee855f65c3109c91fd41 Mon Sep 17 00:00:00 2001 From: michael mccune Date: Fri, 5 Jan 2024 16:49:43 -0500 Subject: [PATCH] UPSTREAM: : improve replica counting on openshift This change adds logic to count the number of owned machines by each machineset when calculating the replica count to the core autoscaler. It is needed because the machine-api controllers do not include machines in deleting phase when updating their replica field. This causes a problem with the core autoscaler as the count of nodes will not match the resources from the cloud provider. This can be removed when the machine-api controllers have been fully removed from openshift. --- .../clusterapi/clusterapi_controller.go | 1 + .../clusterapi/clusterapi_unstructured.go | 62 +++++++++++++++++++ 2 files changed, 63 insertions(+) diff --git a/cluster-autoscaler/cloudprovider/clusterapi/clusterapi_controller.go b/cluster-autoscaler/cloudprovider/clusterapi/clusterapi_controller.go index e6296ea46807..8ef245b28261 100644 --- a/cluster-autoscaler/cloudprovider/clusterapi/clusterapi_controller.go +++ b/cluster-autoscaler/cloudprovider/clusterapi/clusterapi_controller.go @@ -46,6 +46,7 @@ const ( machinePoolProviderIDIndex = "machinePoolProviderIDIndex" nodeProviderIDIndex = "nodeProviderIDIndex" defaultCAPIGroup = "cluster.x-k8s.io" + openshiftMAPIGroup = "machine.openshift.io" // CAPIGroupEnvVar contains the environment variable name which allows overriding defaultCAPIGroup. CAPIGroupEnvVar = "CAPI_GROUP" // CAPIVersionEnvVar contains the environment variable name which allows overriding the Cluster API group version. diff --git a/cluster-autoscaler/cloudprovider/clusterapi/clusterapi_unstructured.go b/cluster-autoscaler/cloudprovider/clusterapi/clusterapi_unstructured.go index b9801f2575d2..b5254cf70582 100644 --- a/cluster-autoscaler/cloudprovider/clusterapi/clusterapi_unstructured.go +++ b/cluster-autoscaler/cloudprovider/clusterapi/clusterapi_unstructured.go @@ -18,6 +18,7 @@ package clusterapi import ( "context" + "encoding/json" "fmt" "path" "strings" @@ -94,6 +95,13 @@ func (r unstructuredScalableResource) Replicas() (int, error) { return 0, err } + // this function needs to differentiate between machine-api and cluster-api + // due to the fact that the machine-api controllers exclude machines in + // deleting phase when calculating replicas. + if gvr.Group == openshiftMAPIGroup { + return r.replicasOpenshift() + } + s, err := r.controller.managementScaleClient.Scales(r.Namespace()).Get(context.TODO(), gvr.GroupResource(), r.Name(), metav1.GetOptions{}) if err != nil { return 0, err @@ -105,6 +113,60 @@ func (r unstructuredScalableResource) Replicas() (int, error) { return int(s.Spec.Replicas), nil } +func (r unstructuredScalableResource) replicasOpenshift() (int, error) { + gvr, err := r.GroupVersionResource() + if err != nil { + return 0, err + } + + if gvr.Group != openshiftMAPIGroup { + return 0, fmt.Errorf("incorrect group for replica count on %s %s/%s", r.Kind(), r.Namespace(), r.Name()) + } + + // get the selector labels from the scalable resource to find the machines + rawSelector, found, err := unstructured.NestedMap(r.unstructured.Object, "spec", "selector") + if !found || err != nil { + return 0, err + } + + // we want to massage the unstructured selector data into a LabelSelector struct + // so that we can more easily create the necessary string for the ListOptions struct, + // the following code helps with that. + data, err := json.Marshal(rawSelector) + if err != nil { + return 0, err + } + + var labelSelector metav1.LabelSelector + err = json.Unmarshal(data, &labelSelector) + if err != nil { + return 0, err + } + + selector, err := metav1.LabelSelectorAsSelector(&labelSelector) + if err != nil { + return 0, err + } + + // get a list of machines filtered by the namespace and the selector labels from the scalable resource + machinesList, err := r.controller.managementClient.Resource(r.controller.machineResource).Namespace(r.Namespace()).List(context.TODO(), metav1.ListOptions{LabelSelector: selector.String()}) + if err != nil { + return 0, err + } + + // filter out inactive machines + var activeMachines []unstructured.Unstructured + for _, item := range machinesList.Items { + if metav1.GetControllerOf(&item) != nil && !metav1.IsControlledBy(&item, r.unstructured) { + continue + } + + activeMachines = append(activeMachines, item) + } + + return len(activeMachines), nil +} + func (r unstructuredScalableResource) SetSize(nreplicas int) error { switch { case nreplicas > r.maxSize: