From 4331f3da5c6f42f3252ddb615206730552053df4 Mon Sep 17 00:00:00 2001 From: Manuel Buil Date: Fri, 24 May 2024 11:27:10 +0200 Subject: [PATCH] Try several times to contact kube-api before failing Signed-off-by: Manuel Buil --- pkg/backend/hostgw/hostgw_windows.go | 19 ++++++++-------- pkg/backend/vxlan/device_windows.go | 23 ++++++++++--------- pkg/backend/vxlan/vxlan_windows.go | 4 ++-- pkg/subnet/kube/kube.go | 34 ++++++++++++++++++++-------- 4 files changed, 47 insertions(+), 33 deletions(-) diff --git a/pkg/backend/hostgw/hostgw_windows.go b/pkg/backend/hostgw/hostgw_windows.go index fe8951076f..8b665709e0 100644 --- a/pkg/backend/hostgw/hostgw_windows.go +++ b/pkg/backend/hostgw/hostgw_windows.go @@ -161,11 +161,11 @@ func (be *HostgwBackend) RegisterNetwork(ctx context.Context, wg *sync.WaitGroup // Wait for the network to populate Management IP log.Infof("Waiting to get ManagementIP from HNSNetwork %s", networkName) var newNetworkID = newNetwork.Id - waitErr = wait.Poll(500*time.Millisecond, 30*time.Second, func() (done bool, err error) { + waitErr := wait.PollUntilContextTimeout(ctx, 500*time.Millisecond, 5*time.Second, true, func(context.Context) (done bool, err error) { newNetwork, lastErr = hcsshim.HNSNetworkRequest("GET", newNetworkID, "") return newNetwork != nil && len(newNetwork.ManagementIP) != 0, nil }) - if waitErr == wait.ErrWaitTimeout { + if waitErr != nil { // Do not swallow the root cause if lastErr != nil { waitErr = lastErr @@ -179,12 +179,11 @@ func (be *HostgwBackend) RegisterNetwork(ctx context.Context, wg *sync.WaitGroup if err != nil { return nil, errors.Wrapf(err, "Failed to parse management ip (%s)", newNetwork.ManagementIP) } - - waitErr = wait.Poll(500*time.Millisecond, 5*time.Second, func() (done bool, err error) { + waitErr = wait.PollUntilContextTimeout(ctx, 500*time.Millisecond, 5*time.Second, true, func(context.Context) (done bool, err error) { _, lastErr = ip.GetInterfaceByIP(managementIP.ToIP()) return lastErr == nil, nil }) - if waitErr == wait.ErrWaitTimeout { + if waitErr != nil { return nil, errors.Wrapf(lastErr, "timeout, failed to get net interface for HNSNetwork %s (%s)", networkName, newNetwork.ManagementIP) } @@ -227,19 +226,19 @@ func (be *HostgwBackend) RegisterNetwork(ctx context.Context, wg *sync.WaitGroup // Wait for the bridgeEndpoint to attach to the host log.Infof("Waiting to attach bridge endpoint %s to host", bridgeEndpointName) - waitErr = wait.Poll(500*time.Millisecond, 5*time.Second, func() (done bool, err error) { + waitErr = wait.PollUntilContextTimeout(ctx, 500*time.Millisecond, 5*time.Second, true, func(context.Context) (done bool, err error) { lastErr = expectedBridgeEndpoint.HostAttach(1) if lastErr == nil { - return true, nil + return false, nil } // See https://github.com/flannel-io/flannel/issues/1391 and // hcsshim lacks some validations to detect the error, so we judge it by error message. if strings.Contains(lastErr.Error(), "This endpoint is already attached to the switch.") { - return true, nil + return false, nil } - return false, nil + return true, nil }) - if waitErr == wait.ErrWaitTimeout { + if waitErr != nil { return nil, errors.Wrapf(lastErr, "failed to hot attach bridge HNSEndpoint %s to host compartment", bridgeEndpointName) } log.Infof("Attached bridge endpoint %s to host successfully", bridgeEndpointName) diff --git a/pkg/backend/vxlan/device_windows.go b/pkg/backend/vxlan/device_windows.go index 08d6b29006..3708880462 100644 --- a/pkg/backend/vxlan/device_windows.go +++ b/pkg/backend/vxlan/device_windows.go @@ -17,6 +17,7 @@ package vxlan import ( + "context" "encoding/json" "fmt" "time" @@ -46,7 +47,7 @@ type NetAdapterNameSettings struct { NetworkAdapterName string `json:"NetworkAdapterName"` } -func newVXLANDevice(devAttrs *vxlanDeviceAttrs) (*vxlanDevice, error) { +func newVXLANDevice(ctx context.Context, devAttrs *vxlanDeviceAttrs) (*vxlanDevice, error) { subnet := createSubnet(devAttrs.addressPrefix.String(), (devAttrs.addressPrefix.IP + 1).String(), "0.0.0.0/0") network := &hcn.HostComputeNetwork{ Type: "Overlay", @@ -90,7 +91,7 @@ func newVXLANDevice(devAttrs *vxlanDeviceAttrs) (*vxlanDevice, error) { addNetAdapterName(network, devAttrs.interfaceName) } - hnsNetwork, err := ensureNetwork(network, devAttrs.addressPrefix.String()) + hnsNetwork, err := ensureNetwork(ctx, network, devAttrs.addressPrefix.String()) if err != nil { return nil, err } @@ -100,7 +101,7 @@ func newVXLANDevice(devAttrs *vxlanDeviceAttrs) (*vxlanDevice, error) { }, nil } -func ensureNetwork(expectedNetwork *hcn.HostComputeNetwork, expectedAddressPrefix string) (*hcn.HostComputeNetwork, error) { +func ensureNetwork(ctx context.Context, expectedNetwork *hcn.HostComputeNetwork, expectedAddressPrefix string) (*hcn.HostComputeNetwork, error) { createNetwork := true networkName := expectedNetwork.Name @@ -134,11 +135,11 @@ func ensureNetwork(expectedNetwork *hcn.HostComputeNetwork, expectedAddressPrefi // Wait for the network to populate Management IP log.Infof("Waiting to get ManagementIP from HostComputeNetwork %s", networkName) var newNetworkID = newNetwork.Id - waitErr = wait.Poll(500*time.Millisecond, 5*time.Second, func() (done bool, err error) { + waitErr = wait.PollUntilContextTimeout(ctx, 500*time.Millisecond, 5*time.Second, true, func(context.Context) (done bool, err error) { newNetwork, lastErr = hcn.GetNetworkByID(newNetworkID) return newNetwork != nil && len(getManagementIP(newNetwork)) != 0, nil }) - if waitErr == wait.ErrWaitTimeout { + if waitErr != nil { // Do not swallow the root cause if lastErr != nil { waitErr = lastErr @@ -146,7 +147,7 @@ func ensureNetwork(expectedNetwork *hcn.HostComputeNetwork, expectedAddressPrefi return nil, errors.Wrapf(lastErr, "timeout, failed to get management IP from HostComputeNetwork %s", networkName) } - err = checkHostNetworkReady(newNetwork) + err = checkHostNetworkReady(ctx, newNetwork) if err != nil { return nil, errors.Wrapf(err, "Interface bound to %s took too long to get ready. Please check your network host configuration", networkName) } @@ -227,7 +228,7 @@ func addNetAdapterName(network *hcn.HostComputeNetwork, netAdapterName string) e } // checkHostNetworkReady waits for the host network to be ready: the main interface must be up and have an IP address -func checkHostNetworkReady(network *hcn.HostComputeNetwork) error { +func checkHostNetworkReady(ctx context.Context, network *hcn.HostComputeNetwork) error { managementIP := getManagementIP(network) // Wait for the interface with the management IP log.Infof("Waiting to get net interface for HostComputeNetwork %s (%s)", network.Name, managementIP) @@ -236,16 +237,16 @@ func checkHostNetworkReady(network *hcn.HostComputeNetwork) error { return errors.Wrapf(err, "Failed to parse management ip (%s)", managementIP) } - waitErr := wait.Poll(3*time.Second, 25*time.Second, func() (done bool, err error) { + waitErr := wait.PollUntilContextTimeout(ctx, 3*time.Second, 25*time.Second, true, func(context.Context) (done bool, err error) { iface, lastErr := ip.GetInterfaceByIP(managementIPv4.ToIP()) if lastErr == nil { log.V(2).Infof("Host interface: %s bound by %s ready", iface.Name, network.Name) - return true, nil + return false, nil } log.V(2).Infof("Host interface bound by %s not ready", network.Name) - return false, nil + return true, nil }) - if waitErr == wait.ErrWaitTimeout { + if waitErr != nil { return errors.Wrapf(waitErr, "timeout, failed to get net interface for HostComputeNetwork %s (%s)", network.Name, managementIP) } return nil diff --git a/pkg/backend/vxlan/vxlan_windows.go b/pkg/backend/vxlan/vxlan_windows.go index 81cbdb5fe7..e0720aec9e 100644 --- a/pkg/backend/vxlan/vxlan_windows.go +++ b/pkg/backend/vxlan/vxlan_windows.go @@ -151,7 +151,7 @@ func (be *VXLANBackend) RegisterNetwork(ctx context.Context, wg *sync.WaitGroup, interfaceName: be.extIface.Iface.Name, } - dev, err := newVXLANDevice(&devAttrs) + dev, err := newVXLANDevice(ctx, &devAttrs) if err != nil { return nil, fmt.Errorf("failed to create VXLAN network: %w", err) } @@ -191,7 +191,7 @@ func (be *VXLANBackend) RegisterNetwork(ctx context.Context, wg *sync.WaitGroup, } // Before contacting the lease server (e.g. kube-api), we verify that the physical interface is ready - err = checkHostNetworkReady(hcnNetwork) + err = checkHostNetworkReady(ctx, hcnNetwork) if err != nil { return nil, fmt.Errorf("interface bound to %s took too long to get ready. Please check your network host configuration", hcnNetwork.Name) } diff --git a/pkg/subnet/kube/kube.go b/pkg/subnet/kube/kube.go index f1dbcda6e5..9d87d66ac2 100644 --- a/pkg/subnet/kube/kube.go +++ b/pkg/subnet/kube/kube.go @@ -283,13 +283,20 @@ func (ksm *kubeSubnetManager) GetNetworkConfig(ctx context.Context) (*subnet.Con func (ksm *kubeSubnetManager) AcquireLease(ctx context.Context, attrs *lease.LeaseAttrs) (*lease.Lease, error) { var cachedNode *v1.Node var err error - if ksm.disableNodeInformer { - cachedNode, err = ksm.client.CoreV1().Nodes().Get(ctx, ksm.nodeName, metav1.GetOptions{ResourceVersion: "0"}) - } else { - cachedNode, err = ksm.nodeStore.Get(ksm.nodeName) - } - if err != nil { - return nil, err + waitErr := wait.PollUntilContextTimeout(ctx, 3*time.Second, 30*time.Second, true, func(context.Context) (done bool, err error) { + if ksm.disableNodeInformer { + cachedNode, err = ksm.client.CoreV1().Nodes().Get(ctx, ksm.nodeName, metav1.GetOptions{ResourceVersion: "0"}) + } else { + cachedNode, err = ksm.nodeStore.Get(ksm.nodeName) + } + if err != nil { + log.V(2).Infof("Failed to get node %q: %v", ksm.nodeName, err) + return false, nil + } + return true, nil + }) + if waitErr != nil { + return nil, fmt.Errorf("timeout contacting kube-api, failed to patch node %q. Error: %v", ksm.nodeName, waitErr) } n := cachedNode.DeepCopy() @@ -397,9 +404,16 @@ func (ksm *kubeSubnetManager) AcquireLease(ctx context.Context, attrs *lease.Lea return nil, fmt.Errorf("failed to create patch for node %q: %v", ksm.nodeName, err) } - _, err = ksm.client.CoreV1().Nodes().Patch(ctx, ksm.nodeName, types.StrategicMergePatchType, patchBytes, metav1.PatchOptions{}, "status") - if err != nil { - return nil, err + waitErr := wait.PollUntilContextTimeout(ctx, 3*time.Second, 30*time.Second, true, func(context.Context) (done bool, err error) { + _, err = ksm.client.CoreV1().Nodes().Patch(ctx, ksm.nodeName, types.StrategicMergePatchType, patchBytes, metav1.PatchOptions{}, "status") + if err != nil { + log.V(2).Infof("Failed to patch node %q: %v", ksm.nodeName, err) + return false, nil + } + return true, nil + }) + if waitErr != nil { + return nil, fmt.Errorf("timeout contacting kube-api, failed to patch node %q. Error: %v", ksm.nodeName, waitErr) } }