UPSTREAM: <carry>: improve replica counting on openshift
This change adds logic to count the machines owned by each machineset
when calculating the replica count reported to the core autoscaler. It
is needed because the machine-api controllers do not include machines
in the deleting phase when updating the replicas field. Without it, the
core autoscaler's node count will not match the resources reported by
the cloud provider.

This can be removed when the machine-api controllers have been fully
removed from openshift.
elmiko committed Jan 8, 2024
1 parent af43b28 commit 32770e2
Showing 2 changed files with 63 additions and 0 deletions.
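
The counting approach described in the commit message can be illustrated with a small standalone sketch (the helper name and object wiring here are illustrative, not part of the patch): a machine counts toward a machineset's replicas when it has no controller owner reference, or when that reference points at the machineset, regardless of the machine's phase.

package main

import (
	"fmt"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
)

// countOwnedMachines mirrors the ownership filter used by the patch below;
// the real code first lists machines by the machineset's spec.selector.
func countOwnedMachines(machineSet *unstructured.Unstructured, machines []unstructured.Unstructured) int {
	count := 0
	for i := range machines {
		machine := &machines[i]
		// skip machines whose controller is some other object
		if metav1.GetControllerOf(machine) != nil && !metav1.IsControlledBy(machine, machineSet) {
			continue
		}
		count++
	}
	return count
}

func main() {
	isController := true

	machineSet := &unstructured.Unstructured{Object: map[string]interface{}{}}
	machineSet.SetUID("machineset-uid")

	owned := unstructured.Unstructured{Object: map[string]interface{}{}}
	owned.SetOwnerReferences([]metav1.OwnerReference{{UID: "machineset-uid", Controller: &isController}})

	foreign := unstructured.Unstructured{Object: map[string]interface{}{}}
	foreign.SetOwnerReferences([]metav1.OwnerReference{{UID: "other-uid", Controller: &isController}})

	// A machine in the deleting phase still exists as an API object, so it
	// is counted here even though the machine-api controllers would already
	// have dropped it from the machineset's replicas field.
	fmt.Println(countOwnedMachines(machineSet, []unstructured.Unstructured{owned, foreign})) // prints 1
}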
@@ -46,6 +46,7 @@ const (
 	machinePoolProviderIDIndex = "machinePoolProviderIDIndex"
 	nodeProviderIDIndex        = "nodeProviderIDIndex"
 	defaultCAPIGroup           = "cluster.x-k8s.io"
+	openshiftMAPIGroup         = "machine.openshift.io"
 	// CAPIGroupEnvVar contains the environment variable name which allows overriding defaultCAPIGroup.
 	CAPIGroupEnvVar = "CAPI_GROUP"
 	// CAPIVersionEnvVar contains the environment variable name which allows overriding the Cluster API group version.
@@ -18,6 +18,7 @@ package clusterapi
 
 import (
 	"context"
+	"encoding/json"
 	"fmt"
 	"path"
 	"strings"
@@ -94,6 +95,13 @@ func (r unstructuredScalableResource) Replicas() (int, error) {
 		return 0, err
 	}
 
+	// this function needs to differentiate between machine-api and cluster-api
+	// due to the fact that the machine-api controllers exclude machines in
+	// deleting phase when calculating replicas.
+	if gvr.Group == openshiftMAPIGroup {
+		return r.replicasOpenshift()
+	}
+
 	s, err := r.controller.managementScaleClient.Scales(r.Namespace()).Get(context.TODO(), gvr.GroupResource(), r.Name(), metav1.GetOptions{})
 	if err != nil {
 		return 0, err
@@ -105,6 +113,60 @@ func (r unstructuredScalableResource) Replicas() (int, error) {
 	return int(s.Spec.Replicas), nil
 }
 
+func (r unstructuredScalableResource) replicasOpenshift() (int, error) {
+	gvr, err := r.GroupVersionResource()
+	if err != nil {
+		return 0, err
+	}
+
+	if gvr.Group != openshiftMAPIGroup {
+		return 0, fmt.Errorf("incorrect group for replica count on %s %s/%s", r.Kind(), r.Namespace(), r.Name())
+	}
+
+	// get the selector labels from the scalable resource to find the machines
+	rawSelector, found, err := unstructured.NestedMap(r.unstructured.Object, "spec", "selector")
+	if !found || err != nil {
+		return 0, err
+	}
+
+	// we want to massage the unstructured selector data into a LabelSelector struct
+	// so that we can more easily create the necessary string for the ListOptions struct,
+	// the following code helps with that.
+	data, err := json.Marshal(rawSelector)
+	if err != nil {
+		return 0, err
+	}
+
+	var labelSelector metav1.LabelSelector
+	err = json.Unmarshal(data, &labelSelector)
+	if err != nil {
+		return 0, err
+	}
+
+	selector, err := metav1.LabelSelectorAsSelector(&labelSelector)
+	if err != nil {
+		return 0, err
+	}
+
+	// get a list of machines filtered by the namespace and the selector labels from the scalable resource
+	machinesList, err := r.controller.managementClient.Resource(r.controller.machineResource).Namespace(r.Namespace()).List(context.TODO(), metav1.ListOptions{LabelSelector: selector.String()})
+	if err != nil {
+		return 0, err
+	}
+
+	// filter out inactive machines
+	var activeMachines []unstructured.Unstructured
+	for _, item := range machinesList.Items {
+		if metav1.GetControllerOf(&item) != nil && !metav1.IsControlledBy(&item, r.unstructured) {
+			continue
+		}
+
+		activeMachines = append(activeMachines, item)
+	}
+
+	return len(activeMachines), nil
+}
+
 func (r unstructuredScalableResource) SetSize(nreplicas int) error {
 	switch {
 	case nreplicas > r.maxSize:
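
The selector handling in replicasOpenshift can be exercised on its own. Below is a minimal sketch, assuming a hypothetical machineset object carrying only a spec.selector field (the label key is illustrative): the unstructured selector map is round-tripped through JSON into a metav1.LabelSelector and then rendered as the string form a ListOptions filter expects.

package main

import (
	"encoding/json"
	"fmt"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
)

func main() {
	// a hypothetical machineset reduced to the one field this sketch reads
	machineSet := &unstructured.Unstructured{Object: map[string]interface{}{
		"spec": map[string]interface{}{
			"selector": map[string]interface{}{
				"matchLabels": map[string]interface{}{
					"machine.openshift.io/cluster-api-machineset": "worker-a",
				},
			},
		},
	}}

	rawSelector, found, err := unstructured.NestedMap(machineSet.Object, "spec", "selector")
	if err != nil || !found {
		panic("spec.selector not found")
	}

	// massage the unstructured map into a typed LabelSelector via JSON
	data, err := json.Marshal(rawSelector)
	if err != nil {
		panic(err)
	}
	var labelSelector metav1.LabelSelector
	if err := json.Unmarshal(data, &labelSelector); err != nil {
		panic(err)
	}

	selector, err := metav1.LabelSelectorAsSelector(&labelSelector)
	if err != nil {
		panic(err)
	}

	// prints: machine.openshift.io/cluster-api-machineset=worker-a
	fmt.Println(selector.String())
}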
