From adbd0991b49bacba16849d7dad9fd8a27e6928bd Mon Sep 17 00:00:00 2001 From: Antonio Ojea Date: Tue, 5 Mar 2024 13:51:00 +0000 Subject: [PATCH] node-controller require providerID to initialize a node Since the migration to the external cloud providers, the node controller in the cloud controller manager is responsible of initializing the nodes. There is a strong assumption across the ecosystem that the nodes has set the node.spec.providerID value, however, the node controller does not check if this value is set during the initialization of the node, and if there are some failures on the cloud provider API calls, the node can be untainted without the value and never reconciled. In addition, it seems that is possible for some cloud provider to not implement the providerID value, though is not likely this is going to happen, but for backward compatibility purposes we should allow this case. The node controller will require the providerID to untain the Nodes, except when the cloud provider does not use InstancesV2 and does implement it. ProviderID is inmutable once set, so that value has preferences, otherwise InstancesV2 is preferred over Instances. Change-Id: Ic41cf7ebcca1ff0fbd8daafc036166f19fc37251 Signed-off-by: Antonio Ojea Kubernetes-commit: 53d38a31611c324129abd9b879d175ff388d0159 --- controllers/node/node_controller.go | 89 +++++----- controllers/node/node_controller_test.go | 202 +++++++++++++++++++---- 2 files changed, 223 insertions(+), 68 deletions(-) diff --git a/controllers/node/node_controller.go b/controllers/node/node_controller.go index b31c04d4..9b402239 100644 --- a/controllers/node/node_controller.go +++ b/controllers/node/node_controller.go @@ -226,6 +226,7 @@ func (cnc *CloudNodeController) processNextWorkItem() bool { if err := cnc.syncHandler(key); err != nil { // Put the item back on the workqueue to handle any transient errors. cnc.workqueue.AddRateLimited(key) + klog.Infof("error syncing '%s': %v, requeuing", key, err) return fmt.Errorf("error syncing '%s': %s, requeuing", key, err.Error()) } @@ -424,12 +425,8 @@ func (cnc *CloudNodeController) syncNode(ctx context.Context, nodeName string) e klog.Infof("Initializing node %s with cloud provider", nodeName) copyNode := curNode.DeepCopy() - providerID, err := cnc.getProviderID(ctx, copyNode) - if err != nil { - return fmt.Errorf("failed to get provider ID for node %s at cloudprovider: %v", nodeName, err) - } - instanceMetadata, err := cnc.getInstanceMetadata(ctx, providerID, copyNode) + instanceMetadata, err := cnc.getInstanceMetadata(ctx, copyNode) if err != nil { return fmt.Errorf("failed to get instance metadata for node %s: %v", nodeName, err) } @@ -439,7 +436,7 @@ func (cnc *CloudNodeController) syncNode(ctx context.Context, nodeName string) e return nil } - nodeModifiers, err := cnc.getNodeModifiersFromCloudProvider(ctx, providerID, copyNode, instanceMetadata) + nodeModifiers, err := cnc.getNodeModifiersFromCloudProvider(ctx, copyNode, instanceMetadata) if err != nil { return fmt.Errorf("failed to get node modifiers from cloud provider: %v", err) } @@ -510,16 +507,13 @@ func (cnc *CloudNodeController) syncNode(ctx context.Context, nodeName string) e // loop, meaning they could get called multiple times. func (cnc *CloudNodeController) getNodeModifiersFromCloudProvider( ctx context.Context, - providerID string, node *v1.Node, instanceMeta *cloudprovider.InstanceMetadata, ) ([]nodeModifier, error) { var nodeModifiers []nodeModifier if node.Spec.ProviderID == "" { - if providerID != "" { - nodeModifiers = append(nodeModifiers, func(n *v1.Node) { n.Spec.ProviderID = providerID }) - } else if instanceMeta.ProviderID != "" { + if instanceMeta.ProviderID != "" { nodeModifiers = append(nodeModifiers, func(n *v1.Node) { n.Spec.ProviderID = instanceMeta.ProviderID }) } } @@ -594,35 +588,33 @@ func (cnc *CloudNodeController) getNodeModifiersFromCloudProvider( return nodeModifiers, nil } -func (cnc *CloudNodeController) getProviderID(ctx context.Context, node *v1.Node) (string, error) { - if node.Spec.ProviderID != "" { - return node.Spec.ProviderID, nil - } - - if _, ok := cnc.cloud.InstancesV2(); ok { - // We don't need providerID when we call InstanceMetadata for InstancesV2 - return "", nil - } - - providerID, err := cloudprovider.GetInstanceProviderID(ctx, cnc.cloud, types.NodeName(node.Name)) - if err == cloudprovider.NotImplemented { - // if the cloud provider being used does not support provider IDs, - // we can safely continue since we will attempt to set node - // addresses given the node name in getNodeAddressesByProviderIDOrName - klog.Warningf("cloud provider does not set node provider ID, using node name to discover node %s", node.Name) - return "", nil - } - - // if the cloud provider being used supports provider IDs, we want - // to propagate the error so that we re-try in the future; if we - // do not, the taint will be removed, and this will not be retried - return providerID, err -} - -// getInstanceMetadata get instance type and nodeAddresses, use Instances if InstancesV2 is off. -func (cnc *CloudNodeController) getInstanceMetadata(ctx context.Context, providerID string, node *v1.Node) (*cloudprovider.InstanceMetadata, error) { +// getInstanceMetadata get providerdID, instance type and nodeAddresses, use Instances if InstancesV2 is off. +// ProviderID is expected to be available, but to keep backward compatibility, +// we should handle some scenarios where it can be missing. It returns an error +// if providerID is missing, except when is not implemented by GetInstanceProviderID. +func (cnc *CloudNodeController) getInstanceMetadata(ctx context.Context, node *v1.Node) (*cloudprovider.InstanceMetadata, error) { + // kubelet can set the provider ID using the flag and is inmutable + providerID := node.Spec.ProviderID + // InstancesV2 require ProviderID to be present if instancesV2, ok := cnc.cloud.InstancesV2(); instancesV2 != nil && ok { - return instancesV2.InstanceMetadata(ctx, node) + metadata, err := instancesV2.InstanceMetadata(ctx, node) + if err != nil { + return nil, err + } + // spec.ProviderID is required for multiple controllers, like loadbalancers, so we should not + // untaint the node until is set. Once it is set, the field is immutable, so no need to reconcile. + // We only set this value during initialization and is never reconciled, so if for some reason + // we are not able to set it, the instance will never be able to acquire it. + // Before external cloud providers were enabled by default, the field was set by the kubelet, and the + // node was created with the value. + // xref: https://issues.k8s.io/123024 + if metadata != nil && metadata.ProviderID == "" { + if providerID == "" { + return metadata, fmt.Errorf("cloud provider does not set node provider ID for node %s", node.Name) + } + metadata.ProviderID = providerID + } + return metadata, nil } // If InstancesV2 not implement, use Instances. @@ -630,6 +622,26 @@ func (cnc *CloudNodeController) getInstanceMetadata(ctx context.Context, provide if !ok { return nil, fmt.Errorf("failed to get instances from cloud provider") } + + var err error + if providerID == "" { + providerID, err = cloudprovider.GetInstanceProviderID(ctx, cnc.cloud, types.NodeName(node.Name)) + if err != nil { + // This is the only case where ProviderID can be skipped + if errors.Is(err, cloudprovider.NotImplemented) { + // if the cloud provider being used does not support provider IDs, + // we can safely continue since we will attempt to set node + // addresses given the node name in getNodeAddressesByProviderIDOrName + klog.Warningf("cloud provider does not set node provider ID, using node name to discover node %s", node.Name) + } else { + // if the cloud provider being used supports provider IDs, we want + // to propagate the error so that we re-try in the future; if we + // do not, the taint will be removed, and this will not be retried + return nil, err + } + } + } + nodeAddresses, err := getNodeAddressesByProviderIDOrName(ctx, instances, providerID, node.Name) if err != nil { return nil, err @@ -640,6 +652,7 @@ func (cnc *CloudNodeController) getInstanceMetadata(ctx context.Context, provide } instanceMetadata := &cloudprovider.InstanceMetadata{ + ProviderID: providerID, InstanceType: instanceType, NodeAddresses: nodeAddresses, } diff --git a/controllers/node/node_controller_test.go b/controllers/node/node_controller_test.go index 7f5e95e0..2f0800b6 100644 --- a/controllers/node/node_controller_test.go +++ b/controllers/node/node_controller_test.go @@ -50,6 +50,7 @@ func Test_syncNode(t *testing.T) { fakeCloud *fakecloud.Cloud existingNode *v1.Node updatedNode *v1.Node + expectedErr bool }{ { name: "node initialized with provider ID", @@ -545,7 +546,8 @@ func Test_syncNode(t *testing.T) { }, }, { - name: "provided node IP address is not valid", + name: "provided node IP address is not valid", + expectedErr: true, fakeCloud: &fakecloud.Cloud{ EnableInstancesV2: false, Addresses: []v1.NodeAddress{ @@ -643,7 +645,8 @@ func Test_syncNode(t *testing.T) { }, }, { - name: "provided node IP address is not present", + name: "provided node IP address is not present", + expectedErr: true, fakeCloud: &fakecloud.Cloud{ EnableInstancesV2: false, Addresses: []v1.NodeAddress{ @@ -835,8 +838,10 @@ func Test_syncNode(t *testing.T) { }, }, }, - { - name: "provider ID not implemented", + { // for backward compatibility the cloud providers that does not implement + // providerID does not block the node initialization + name: "provider ID not implemented", + expectedErr: false, fakeCloud: &fakecloud.Cloud{ EnableInstancesV2: false, InstanceTypes: map[types.NodeName]string{}, @@ -1641,7 +1646,8 @@ func Test_syncNode(t *testing.T) { }, }, { - name: "[instanceV2] provider ID not implemented", + name: "[instanceV2] provider ID not implemented", + expectedErr: true, fakeCloud: &fakecloud.Cloud{ EnableInstancesV2: true, InstanceTypes: map[types.NodeName]string{}, @@ -1693,12 +1699,19 @@ func Test_syncNode(t *testing.T) { }, }, Spec: v1.NodeSpec{ - Taints: []v1.Taint{}, + Taints: []v1.Taint{ + { + Key: cloudproviderapi.TaintExternalCloudProvider, + Value: "true", + Effect: v1.TaintEffectNoSchedule, + }, + }, }, }, }, { - name: "[instanceV2] error getting InstanceMetadata", + name: "[instanceV2] error getting InstanceMetadata", + expectedErr: true, fakeCloud: &fakecloud.Cloud{ EnableInstancesV2: true, InstanceTypes: map[types.NodeName]string{}, @@ -1786,7 +1799,10 @@ func Test_syncNode(t *testing.T) { w := eventBroadcaster.StartLogging(klog.Infof) defer w.Stop() - cloudNodeController.syncNode(context.TODO(), test.existingNode.Name) + err := cloudNodeController.syncNode(context.TODO(), test.existingNode.Name) + if (err != nil) != test.expectedErr { + t.Fatalf("error got: %v expected: %v", err, test.expectedErr) + } updatedNode, err := clientset.CoreV1().Nodes().Get(context.TODO(), test.existingNode.Name, metav1.GetOptions{}) if err != nil { @@ -1833,6 +1849,9 @@ func TestGCEConditionV2(t *testing.T) { InstanceTypes: map[types.NodeName]string{ types.NodeName("node0"): "t1.micro", }, + ProviderID: map[types.NodeName]string{ + types.NodeName("node0"): "fake://12334", + }, Addresses: []v1.NodeAddress{ { Type: v1.NodeHostName, @@ -2376,15 +2395,16 @@ func TestNodeAddressesNotUpdate(t *testing.T) { } } -func TestGetProviderID(t *testing.T) { +func TestGetInstanceMetadata(t *testing.T) { tests := []struct { - name string - fakeCloud *fakecloud.Cloud - existingNode *v1.Node - expectedProviderID string + name string + fakeCloud *fakecloud.Cloud + existingNode *v1.Node + expectedMetadata *cloudprovider.InstanceMetadata + expectErr bool }{ { - name: "node initialized with provider ID", + name: "cloud implemented with Instances and provider ID", fakeCloud: &fakecloud.Cloud{ EnableInstancesV2: false, InstanceTypes: map[types.NodeName]string{ @@ -2393,6 +2413,9 @@ func TestGetProviderID(t *testing.T) { ExtID: map[types.NodeName]string{ types.NodeName("node0"): "12345", }, + ProviderID: map[types.NodeName]string{ + types.NodeName("node0"): "fake://12345", + }, Addresses: []v1.NodeAddress{ { Type: v1.NodeHostName, @@ -2423,21 +2446,27 @@ func TestGetProviderID(t *testing.T) { Effect: v1.TaintEffectNoSchedule, }, }, - ProviderID: "fake://12345", }, }, - expectedProviderID: "fake://12345", + expectedMetadata: &cloudprovider.InstanceMetadata{ + ProviderID: "fake://12345", + NodeAddresses: []v1.NodeAddress{ + {Type: "Hostname", Address: "node0.cloud.internal"}, + {Type: "InternalIP", Address: "10.0.0.1"}, + {Type: "ExternalIP", Address: "132.143.154.163"}, + }, + }, }, { - name: "cloud implemented with Instances (without providerID)", + name: "cloud implemented with Instances (providerID not implemented)", fakeCloud: &fakecloud.Cloud{ EnableInstancesV2: false, InstanceTypes: map[types.NodeName]string{ types.NodeName("node0"): "t1.micro", types.NodeName("fake://12345"): "t1.micro", }, - ExtID: map[types.NodeName]string{ - types.NodeName("node0"): "12345", + ExtIDErr: map[types.NodeName]error{ + types.NodeName("node0"): cloudprovider.NotImplemented, }, Addresses: []v1.NodeAddress{ { @@ -2461,7 +2490,58 @@ func TestGetProviderID(t *testing.T) { CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), }, }, - expectedProviderID: "fake://12345", + expectedMetadata: &cloudprovider.InstanceMetadata{ + NodeAddresses: []v1.NodeAddress{ + {Type: "Hostname", Address: "node0.cloud.internal"}, + {Type: "InternalIP", Address: "10.0.0.1"}, + {Type: "ExternalIP", Address: "132.143.154.163"}, + }, + }, + }, + { + name: "cloud implemented with Instances (providerID not implemented) and node with providerID", + fakeCloud: &fakecloud.Cloud{ + EnableInstancesV2: false, + InstanceTypes: map[types.NodeName]string{ + types.NodeName("node0"): "t1.micro", + types.NodeName("fake://12345"): "t1.micro", + }, + ExtIDErr: map[types.NodeName]error{ + types.NodeName("node0"): cloudprovider.NotImplemented, + }, + Addresses: []v1.NodeAddress{ + { + Type: v1.NodeHostName, + Address: "node0.cloud.internal", + }, + { + Type: v1.NodeInternalIP, + Address: "10.0.0.1", + }, + { + Type: v1.NodeExternalIP, + Address: "132.143.154.163", + }, + }, + Err: nil, + }, + existingNode: &v1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "node0", + CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), + }, + Spec: v1.NodeSpec{ + ProviderID: "fake://asdasd", + }, + }, + expectedMetadata: &cloudprovider.InstanceMetadata{ + ProviderID: "fake://asdasd", + NodeAddresses: []v1.NodeAddress{ + {Type: "Hostname", Address: "node0.cloud.internal"}, + {Type: "InternalIP", Address: "10.0.0.1"}, + {Type: "ExternalIP", Address: "132.143.154.163"}, + }, + }, }, { name: "cloud implemented with InstancesV2 (with providerID)", @@ -2474,6 +2554,9 @@ func TestGetProviderID(t *testing.T) { ExtID: map[types.NodeName]string{ types.NodeName("node0"): "12345", }, + ProviderID: map[types.NodeName]string{ + types.NodeName("node0"): "fake://12345", + }, Addresses: []v1.NodeAddress{ { Type: v1.NodeHostName, @@ -2503,13 +2586,20 @@ func TestGetProviderID(t *testing.T) { Effect: v1.TaintEffectNoSchedule, }, }, - ProviderID: "fake://12345", }, }, - expectedProviderID: "fake://12345", + expectedMetadata: &cloudprovider.InstanceMetadata{ + ProviderID: "fake://12345", + NodeAddresses: []v1.NodeAddress{ + {Type: "Hostname", Address: "node0.cloud.internal"}, + {Type: "InternalIP", Address: "10.0.0.1"}, + {Type: "ExternalIP", Address: "132.143.154.163"}, + }, + }, }, - { - name: "cloud implemented with InstancesV2 (without providerID)", + { // it will be requeueud later + name: "cloud implemented with InstancesV2 (without providerID)", + expectErr: true, fakeCloud: &fakecloud.Cloud{ EnableInstancesV2: true, InstanceTypes: map[types.NodeName]string{ @@ -2550,7 +2640,59 @@ func TestGetProviderID(t *testing.T) { }, }, }, - expectedProviderID: "", + expectedMetadata: &cloudprovider.InstanceMetadata{ + NodeAddresses: []v1.NodeAddress{ + {Type: "Hostname", Address: "node0.cloud.internal"}, + {Type: "InternalIP", Address: "10.0.0.1"}, + {Type: "ExternalIP", Address: "132.143.154.163"}, + }, + }, + }, + { + name: "cloud implemented with InstancesV2 (without providerID) and node with providerID", + fakeCloud: &fakecloud.Cloud{ + EnableInstancesV2: true, + InstanceTypes: map[types.NodeName]string{ + types.NodeName("node0"): "t1.micro", + types.NodeName("fake://12345"): "t1.micro", + }, + ExtID: map[types.NodeName]string{ + types.NodeName("node0"): "12345", + }, + Addresses: []v1.NodeAddress{ + { + Type: v1.NodeHostName, + Address: "node0.cloud.internal", + }, + { + Type: v1.NodeInternalIP, + Address: "10.0.0.1", + }, + { + Type: v1.NodeExternalIP, + Address: "132.143.154.163", + }, + }, + Err: nil, + }, + existingNode: &v1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "node0", + CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), + }, + Spec: v1.NodeSpec{ + ProviderID: "fake://12345", + }, + }, + expectedMetadata: &cloudprovider.InstanceMetadata{ + ProviderID: "fake://12345", + InstanceType: "t1.micro", + NodeAddresses: []v1.NodeAddress{ + {Type: "Hostname", Address: "node0.cloud.internal"}, + {Type: "InternalIP", Address: "10.0.0.1"}, + {Type: "ExternalIP", Address: "132.143.154.163"}, + }, + }, }, } @@ -2560,13 +2702,13 @@ func TestGetProviderID(t *testing.T) { cloud: test.fakeCloud, } - providerID, err := cloudNodeController.getProviderID(context.TODO(), test.existingNode) - if err != nil { - t.Fatalf("error getting provider ID: %v", err) + metadata, err := cloudNodeController.getInstanceMetadata(context.TODO(), test.existingNode) + if (err != nil) != test.expectErr { + t.Fatalf("error expected %v got: %v", test.expectErr, err) } - if !cmp.Equal(providerID, test.expectedProviderID) { - t.Errorf("unexpected providerID %s", cmp.Diff(providerID, test.expectedProviderID)) + if !cmp.Equal(metadata, test.expectedMetadata) { + t.Errorf("unexpected metadata %s", cmp.Diff(metadata, test.expectedMetadata)) } }) }