From a53d048bae862b5f2c88aaeaee5b8f26c1b46dea Mon Sep 17 00:00:00 2001 From: Kubernetes Prow Robot <k8s-ci-robot@users.noreply.github.com> Date: Thu, 19 Aug 2021 11:35:25 -0700 Subject: [PATCH] Merge pull request #4274 from kinvolk/imran/cloud-provider-packet-fix Cloud provider[Packet] fixes --- .../cloudprovider/packet/README.md | 29 +++++++++++++ .../packet/packet_cloud_provider.go | 13 +++++- .../packet/packet_manager_rest.go | 42 +++++++++++++++++-- 3 files changed, 80 insertions(+), 4 deletions(-) diff --git a/cluster-autoscaler/cloudprovider/packet/README.md b/cluster-autoscaler/cloudprovider/packet/README.md index d78ada775d75..c43d7f0d83be 100644 --- a/cluster-autoscaler/cloudprovider/packet/README.md +++ b/cluster-autoscaler/cloudprovider/packet/README.md @@ -78,6 +78,35 @@ affinity: - t1.small.x86 ``` +## CCM and Controller node labels + +### CCM +By default, autoscaler assumes that you have an older deprecated version of `packet-ccm` installed in your +cluster. If however, that is not the case and you've migrated to the new `cloud-provider-equinix-metal` CCM, +then this must be told to autoscaler. This can be done via setting an environment variable in the deployment: +``` +env: + - name: INSTALLED_CCM + value: cloud-provider-equinix-metal +``` +**NOTE**: As a prerequisite, ensure that all worker nodes in your cluster have the prefix `equinixmetal://` in +the Node spec `.spec.providerID`. If there are any existing worker nodes with prefix `packet://`, then drain +the node, remove the node and restart the kubelet on that worker node to re-register the node in the cluster, +this would ensure that `cloud-provider-equinix-metal` CCM sets the uuid with prefix `equinixmetal://` to the +field `.spec.ProviderID`. + +### Controller node labels + +Autoscaler assumes that control plane nodes in your cluster are identified by the label +`node-role.kubernetes.io/master`. If for some reason, this assumption is not true in your case, then set the +envirnment variable in the deployment: + +``` +env: + - name: PACKET_CONTROLLER_NODE_IDENTIFIER_LABEL + value: <label> +``` + ## Notes The autoscaler will not remove nodes which have non-default kube-system pods. diff --git a/cluster-autoscaler/cloudprovider/packet/packet_cloud_provider.go b/cluster-autoscaler/cloudprovider/packet/packet_cloud_provider.go index 32e28e7e6ad8..ca59a84f3d23 100644 --- a/cluster-autoscaler/cloudprovider/packet/packet_cloud_provider.go +++ b/cluster-autoscaler/cloudprovider/packet/packet_cloud_provider.go @@ -37,6 +37,11 @@ const ( ProviderName = "packet" // GPULabel is the label added to nodes with GPU resource. GPULabel = "cloud.google.com/gke-accelerator" + // DefaultControllerNodeLabelKey is the label added to Master/Controller to identify as + // master/controller node. + DefaultControllerNodeLabelKey = "node-role.kubernetes.io/master" + // ControllerNodeIdentifierEnv is the string for the environment variable. + ControllerNodeIdentifierEnv = "PACKET_CONTROLLER_NODE_IDENTIFIER_LABEL" ) var ( @@ -94,7 +99,13 @@ func (pcp *packetCloudProvider) AddNodeGroup(group packetNodeGroup) { // // Since only a single node group is currently supported, the first node group is always returned. func (pcp *packetCloudProvider) NodeGroupForNode(node *apiv1.Node) (cloudprovider.NodeGroup, error) { - if _, found := node.ObjectMeta.Labels["node-role.kubernetes.io/master"]; found { + controllerNodeLabel := os.Getenv(ControllerNodeIdentifierEnv) + if controllerNodeLabel == "" { + klog.V(3).Infof("env %s not set, using default: %s", ControllerNodeIdentifierEnv, DefaultControllerNodeLabelKey) + controllerNodeLabel = DefaultControllerNodeLabelKey + } + + if _, found := node.ObjectMeta.Labels[controllerNodeLabel]; found { return nil, nil } nodeGroupId, err := pcp.packetManager.NodeGroupForNode(node.ObjectMeta.Labels, node.Spec.ProviderID) diff --git a/cluster-autoscaler/cloudprovider/packet/packet_manager_rest.go b/cluster-autoscaler/cloudprovider/packet/packet_manager_rest.go index 9963b2142dd1..a30428603990 100644 --- a/cluster-autoscaler/cloudprovider/packet/packet_manager_rest.go +++ b/cluster-autoscaler/cloudprovider/packet/packet_manager_rest.go @@ -42,6 +42,11 @@ import ( schedulerframework "k8s.io/kubernetes/pkg/scheduler/framework/v1alpha1" ) +const ( + packetPrefix = "packet://" + equinixMetalPrefix = "equinixmetal://" +) + type instanceType struct { InstanceName string CPU int64 @@ -275,7 +280,12 @@ func Contains(a []string, x string) bool { // createPacketManagerRest sets up the client and returns // an packetManagerRest. func createPacketManagerRest(configReader io.Reader, discoverOpts cloudprovider.NodeGroupDiscoveryOptions, opts config.AutoscalingOptions) (*packetManagerRest, error) { - var cfg ConfigFile + // Initialize ConfigFile instance + cfg := ConfigFile{ + DefaultNodegroupdef: ConfigNodepool{}, + Nodegroupdef: map[string]*ConfigNodepool{}, + } + if configReader != nil { if err := gcfg.ReadInto(&cfg, configReader); err != nil { klog.Errorf("Couldn't read config: %v", err) @@ -395,7 +405,11 @@ func (mgr *packetManagerRest) NodeGroupForNode(labels map[string]string, nodeId if nodegroup, ok := labels["pool"]; ok { return nodegroup, nil } - device, err := mgr.getPacketDevice(strings.TrimPrefix(nodeId, "packet://")) + + trimmedNodeId := strings.TrimPrefix(nodeId, packetPrefix) + trimmedNodeId = strings.TrimPrefix(trimmedNodeId, equinixMetalPrefix) + + device, err := mgr.getPacketDevice(trimmedNodeId) if err != nil { return "", fmt.Errorf("Could not find group for node: %s %s", nodeId, err) } @@ -540,9 +554,31 @@ func (mgr *packetManagerRest) getNodes(nodegroup string) ([]string, error) { // Get node ProviderIDs by getting device IDs from Packet devices, err := mgr.listPacketDevices() nodes := []string{} + + // This bit of code along with the switch statement, checks if the CCM installed on the cluster is + // `packet-ccm` or `cloud-provider-equinix-metal`. The reason its important to check because depending + // on the CCM installed, the prefix in providerID of K8s Node spec differs from either `packet://` or + // `equinixmetal://`. This is now needed as `packet-ccm` is now deprecated and renamed in favor of + // `cloud-provider-equinix-metal`. + // This code checks if the INSTALLED_CCM env var is set or not. If set to `cloud-provider-equinix-metal`, + // the prefix is set to `equinixmetal://` and any other case the prefix is `packet://`. + // At a later point in time, there would be a need to make `equinixmetal://` prefix as the default or do away + // with `packet://` prefix entirely. This should happen presumably when the packet code in autoscaler is + // renamed from packet to equinixmetal. + prefix := packetPrefix + + switch installedCCM := os.Getenv("INSTALLED_CCM"); installedCCM { + case "packet-ccm": + prefix = packetPrefix + case "cloud-provider-equinix-metal": + prefix = equinixMetalPrefix + default: + klog.V(3).Info("Unrecognized value: expected INSTALLED_CCM to be either `packet-ccm` or `cloud-provider-equinix-metal`, using default: `packet-ccm`") + } + for _, d := range devices.Devices { if Contains(d.Tags, "k8s-cluster-"+mgr.getNodePoolDefinition(nodegroup).clusterName) && Contains(d.Tags, "k8s-nodepool-"+nodegroup) { - nodes = append(nodes, fmt.Sprintf("packet://%s", d.ID)) + nodes = append(nodes, fmt.Sprintf("%s%s", prefix, d.ID)) } } return nodes, err