From a53d048bae862b5f2c88aaeaee5b8f26c1b46dea Mon Sep 17 00:00:00 2001
From: Kubernetes Prow Robot <k8s-ci-robot@users.noreply.github.com>
Date: Thu, 19 Aug 2021 11:35:25 -0700
Subject: [PATCH] Merge pull request #4274 from
 kinvolk/imran/cloud-provider-packet-fix

Cloud provider[Packet] fixes
---
 .../cloudprovider/packet/README.md            | 29 +++++++++++++
 .../packet/packet_cloud_provider.go           | 13 +++++-
 .../packet/packet_manager_rest.go             | 42 +++++++++++++++++--
 3 files changed, 80 insertions(+), 4 deletions(-)

diff --git a/cluster-autoscaler/cloudprovider/packet/README.md b/cluster-autoscaler/cloudprovider/packet/README.md
index d78ada775d75..c43d7f0d83be 100644
--- a/cluster-autoscaler/cloudprovider/packet/README.md
+++ b/cluster-autoscaler/cloudprovider/packet/README.md
@@ -78,6 +78,35 @@ affinity:
           - t1.small.x86
 ```
 
+## CCM and Controller node labels
+
+### CCM
+By default, autoscaler assumes that you have an older deprecated version of `packet-ccm` installed in your
+cluster.  If however, that is not the case and you've migrated to the new `cloud-provider-equinix-metal` CCM,
+then this must be told to autoscaler. This can be done via setting an environment variable in the deployment:
+```
+env:
+  - name: INSTALLED_CCM
+    value: cloud-provider-equinix-metal
+```
+**NOTE**: As a prerequisite, ensure that all worker nodes in your cluster have the prefix `equinixmetal://` in
+the Node spec `.spec.providerID`. If there are any existing worker nodes with prefix `packet://`, then drain
+the node, remove the node and restart the kubelet on that worker node to re-register the node in the cluster,
+this would ensure that `cloud-provider-equinix-metal` CCM sets the uuid with prefix `equinixmetal://` to the
+field `.spec.ProviderID`.
+
+### Controller node labels
+
+Autoscaler assumes that control plane nodes in your cluster are identified by the label
+`node-role.kubernetes.io/master`. If for some reason, this assumption is not true in your case, then set the
+envirnment variable in the deployment:
+
+```
+env:
+  - name: PACKET_CONTROLLER_NODE_IDENTIFIER_LABEL
+    value: <label>
+```
+
 ## Notes
 
 The autoscaler will not remove nodes which have non-default kube-system pods.
diff --git a/cluster-autoscaler/cloudprovider/packet/packet_cloud_provider.go b/cluster-autoscaler/cloudprovider/packet/packet_cloud_provider.go
index 32e28e7e6ad8..ca59a84f3d23 100644
--- a/cluster-autoscaler/cloudprovider/packet/packet_cloud_provider.go
+++ b/cluster-autoscaler/cloudprovider/packet/packet_cloud_provider.go
@@ -37,6 +37,11 @@ const (
 	ProviderName = "packet"
 	// GPULabel is the label added to nodes with GPU resource.
 	GPULabel = "cloud.google.com/gke-accelerator"
+	// DefaultControllerNodeLabelKey is the label added to Master/Controller to identify as
+	// master/controller node.
+	DefaultControllerNodeLabelKey = "node-role.kubernetes.io/master"
+	// ControllerNodeIdentifierEnv is the string for the environment variable.
+	ControllerNodeIdentifierEnv = "PACKET_CONTROLLER_NODE_IDENTIFIER_LABEL"
 )
 
 var (
@@ -94,7 +99,13 @@ func (pcp *packetCloudProvider) AddNodeGroup(group packetNodeGroup) {
 //
 // Since only a single node group is currently supported, the first node group is always returned.
 func (pcp *packetCloudProvider) NodeGroupForNode(node *apiv1.Node) (cloudprovider.NodeGroup, error) {
-	if _, found := node.ObjectMeta.Labels["node-role.kubernetes.io/master"]; found {
+	controllerNodeLabel := os.Getenv(ControllerNodeIdentifierEnv)
+	if controllerNodeLabel == "" {
+		klog.V(3).Infof("env %s not set, using default: %s", ControllerNodeIdentifierEnv, DefaultControllerNodeLabelKey)
+		controllerNodeLabel = DefaultControllerNodeLabelKey
+	}
+
+	if _, found := node.ObjectMeta.Labels[controllerNodeLabel]; found {
 		return nil, nil
 	}
 	nodeGroupId, err := pcp.packetManager.NodeGroupForNode(node.ObjectMeta.Labels, node.Spec.ProviderID)
diff --git a/cluster-autoscaler/cloudprovider/packet/packet_manager_rest.go b/cluster-autoscaler/cloudprovider/packet/packet_manager_rest.go
index 9963b2142dd1..a30428603990 100644
--- a/cluster-autoscaler/cloudprovider/packet/packet_manager_rest.go
+++ b/cluster-autoscaler/cloudprovider/packet/packet_manager_rest.go
@@ -42,6 +42,11 @@ import (
 	schedulerframework "k8s.io/kubernetes/pkg/scheduler/framework/v1alpha1"
 )
 
+const (
+	packetPrefix                 = "packet://"
+	equinixMetalPrefix           = "equinixmetal://"
+)
+
 type instanceType struct {
 	InstanceName string
 	CPU          int64
@@ -275,7 +280,12 @@ func Contains(a []string, x string) bool {
 // createPacketManagerRest sets up the client and returns
 // an packetManagerRest.
 func createPacketManagerRest(configReader io.Reader, discoverOpts cloudprovider.NodeGroupDiscoveryOptions, opts config.AutoscalingOptions) (*packetManagerRest, error) {
-	var cfg ConfigFile
+	// Initialize ConfigFile instance
+	cfg := ConfigFile{
+		DefaultNodegroupdef: ConfigNodepool{},
+		Nodegroupdef:        map[string]*ConfigNodepool{},
+	}
+
 	if configReader != nil {
 		if err := gcfg.ReadInto(&cfg, configReader); err != nil {
 			klog.Errorf("Couldn't read config: %v", err)
@@ -395,7 +405,11 @@ func (mgr *packetManagerRest) NodeGroupForNode(labels map[string]string, nodeId
 	if nodegroup, ok := labels["pool"]; ok {
 		return nodegroup, nil
 	}
-	device, err := mgr.getPacketDevice(strings.TrimPrefix(nodeId, "packet://"))
+
+	trimmedNodeId := strings.TrimPrefix(nodeId, packetPrefix)
+	trimmedNodeId = strings.TrimPrefix(trimmedNodeId, equinixMetalPrefix)
+
+	device, err := mgr.getPacketDevice(trimmedNodeId)
 	if err != nil {
 		return "", fmt.Errorf("Could not find group for node: %s %s", nodeId, err)
 	}
@@ -540,9 +554,31 @@ func (mgr *packetManagerRest) getNodes(nodegroup string) ([]string, error) {
 	// Get node ProviderIDs by getting device IDs from Packet
 	devices, err := mgr.listPacketDevices()
 	nodes := []string{}
+
+	// This bit of code along with the switch statement, checks if the CCM installed on the cluster is
+	// `packet-ccm` or `cloud-provider-equinix-metal`. The reason its important to check because depending
+	// on the CCM installed, the prefix in providerID of K8s Node spec differs from either `packet://` or
+	// `equinixmetal://`. This is now needed as `packet-ccm` is now deprecated and renamed in favor of
+	// `cloud-provider-equinix-metal`.
+	// This code checks if the INSTALLED_CCM env var is set or not. If set to `cloud-provider-equinix-metal`,
+	// the prefix is set to `equinixmetal://` and any other case the prefix is `packet://`.
+	// At a later point in time, there would be a need to make `equinixmetal://` prefix as the default or do away
+	// with `packet://` prefix entirely. This should happen presumably when the packet code in autoscaler is
+	// renamed from packet to equinixmetal.
+	prefix := packetPrefix
+
+	switch installedCCM := os.Getenv("INSTALLED_CCM"); installedCCM {
+	case "packet-ccm":
+		prefix = packetPrefix
+	case "cloud-provider-equinix-metal":
+		prefix = equinixMetalPrefix
+	default:
+		klog.V(3).Info("Unrecognized value: expected INSTALLED_CCM to be either `packet-ccm` or `cloud-provider-equinix-metal`, using default: `packet-ccm`")
+	}
+
 	for _, d := range devices.Devices {
 		if Contains(d.Tags, "k8s-cluster-"+mgr.getNodePoolDefinition(nodegroup).clusterName) && Contains(d.Tags, "k8s-nodepool-"+nodegroup) {
-			nodes = append(nodes, fmt.Sprintf("packet://%s", d.ID))
+			nodes = append(nodes, fmt.Sprintf("%s%s", prefix, d.ID))
 		}
 	}
 	return nodes, err