From 7ef114bb9b11e089230fe33446aed7e0ad7c991a Mon Sep 17 00:00:00 2001
From: Emilien Macchi
Date: Tue, 11 Jun 2024 15:09:49 -0400
Subject: [PATCH] Wait and requeue if LB is in PENDING_DELETE

If the LB is being deleted when a cluster is deleted, it'll go through
the PENDING_DELETE state, and at this stage there is nothing we can do
but wait for the LB to actually be deleted.

If the LB is in that state during the deletion, let's just return no
error but request a reconcile after some time.
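
To make the new contract concrete, here is a minimal sketch of the
shape the controller now relies on: a cleanup call may hand back an
optional *ctrl.Result meaning "not an error, but requeue and check
again later", which the reconciler propagates as-is. The names below
(deleteSomething, reconcileDeleteStep) are hypothetical placeholders,
not code from this patch:

    package sketch

    import (
    	"context"
    	"time"

    	ctrl "sigs.k8s.io/controller-runtime"
    )

    // deleteSomething is a hypothetical stand-in for a service call such
    // as DeleteLoadBalancer: a non-nil *ctrl.Result means "not an error,
    // come back later"; an error means the deletion actually failed.
    func deleteSomething(_ context.Context) (*ctrl.Result, error) {
    	// Pretend the resource is still being torn down by the cloud.
    	return &ctrl.Result{RequeueAfter: 15 * time.Second}, nil
    }

    // reconcileDeleteStep shows how a reconciler consumes that contract.
    func reconcileDeleteStep(ctx context.Context) (ctrl.Result, error) {
    	result, err := deleteSomething(ctx)
    	if err != nil {
    		// A real failure: surface it so the reconcile is retried with backoff.
    		return ctrl.Result{}, err
    	}
    	if result != nil {
    		// Still deleting: requeue after the requested delay without
    		// recording an error against the cluster.
    		return *result, nil
    	}
    	// Deletion is complete; continue with the rest of the delete flow.
    	return ctrl.Result{}, nil
    }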
---
 controllers/openstackcluster_controller.go    |  9 ++++-
 .../services/loadbalancer/loadbalancer.go     | 40 +++++++++++++------
 2 files changed, 35 insertions(+), 14 deletions(-)

diff --git a/controllers/openstackcluster_controller.go b/controllers/openstackcluster_controller.go
index 725cfcc319..c396b4b4c8 100644
--- a/controllers/openstackcluster_controller.go
+++ b/controllers/openstackcluster_controller.go
@@ -59,7 +59,8 @@ import (
 )
 
 const (
-	waitForBastionToReconcile = 15 * time.Second
+	waitForBastionToReconcile  = 15 * time.Second
+	waitForOctaviaPortsCleanup = 15 * time.Second
 )
 
 // OpenStackClusterReconciler reconciles a OpenStackCluster object.
@@ -183,10 +184,14 @@ func (r *OpenStackClusterReconciler) reconcileDelete(ctx context.Context, scope
 			return reconcile.Result{}, err
 		}
 
-		if err = loadBalancerService.DeleteLoadBalancer(openStackCluster, clusterResourceName); err != nil {
+		result, err := loadBalancerService.DeleteLoadBalancer(openStackCluster, clusterResourceName)
+		if err != nil {
 			handleUpdateOSCError(openStackCluster, fmt.Errorf("failed to delete load balancer: %w", err), false)
 			return reconcile.Result{}, fmt.Errorf("failed to delete load balancer: %w", err)
 		}
+		if result != nil {
+			return *result, nil
+		}
 	}
 
 	// if ManagedSubnets was not set, no network was created.
diff --git a/pkg/cloud/services/loadbalancer/loadbalancer.go b/pkg/cloud/services/loadbalancer/loadbalancer.go
index fbee602826..f29d633777 100644
--- a/pkg/cloud/services/loadbalancer/loadbalancer.go
+++ b/pkg/cloud/services/loadbalancer/loadbalancer.go
@@ -31,6 +31,7 @@ import (
 	"k8s.io/apimachinery/pkg/util/wait"
 	utilsnet "k8s.io/utils/net"
 	"k8s.io/utils/ptr"
+	ctrl "sigs.k8s.io/controller-runtime"
 
 	infrav1 "sigs.k8s.io/cluster-api-provider-openstack/api/v1beta1"
 	"sigs.k8s.io/cluster-api-provider-openstack/pkg/record"
@@ -41,12 +42,16 @@ import (
 )
 
 const (
-	networkPrefix   string = "k8s-clusterapi"
-	kubeapiLBSuffix string = "kubeapi"
-	resolvedMsg     string = "ControlPlaneEndpoint.Host is not an IP address, using the first resolved IP address"
+	networkPrefix           string = "k8s-clusterapi"
+	kubeapiLBSuffix         string = "kubeapi"
+	resolvedMsg             string = "ControlPlaneEndpoint.Host is not an IP address, using the first resolved IP address"
+	waitForOctaviaLBCleanup        = 15 * time.Second
 )
 
-const loadBalancerProvisioningStatusActive = "ACTIVE"
+const (
+	loadBalancerProvisioningStatusActive        = "ACTIVE"
+	loadBalancerProvisioningStatusPendingDelete = "PENDING_DELETE"
+)
 
 // We wrap the LookupHost function in a variable to allow overriding it in unit tests.
 //
@@ -658,32 +663,40 @@ func (s *Service) ReconcileLoadBalancerMember(openStackCluster *infrav1.OpenStac
 	return nil
 }
 
-func (s *Service) DeleteLoadBalancer(openStackCluster *infrav1.OpenStackCluster, clusterResourceName string) error {
+func (s *Service) DeleteLoadBalancer(openStackCluster *infrav1.OpenStackCluster, clusterResourceName string) (result *ctrl.Result, reterr error) {
 	loadBalancerName := getLoadBalancerName(clusterResourceName)
 	lb, err := s.checkIfLbExists(loadBalancerName)
 	if err != nil {
-		return err
+		return nil, err
 	}
 
 	if lb == nil {
-		return nil
+		return nil, nil
+	}
+
+	// If the load balancer is already in PENDING_DELETE state, we don't need to do anything.
+	// However, we should still wait for the load balancer to be deleted, which is why we
+	// request a new reconcile after a certain amount of time.
+	if lb.ProvisioningStatus == loadBalancerProvisioningStatusPendingDelete {
+		s.scope.Logger().Info("Load balancer is already in PENDING_DELETE state", "name", loadBalancerName)
+		return &ctrl.Result{RequeueAfter: waitForOctaviaLBCleanup}, nil
 	}
 
 	if lb.VipPortID != "" {
 		fip, err := s.networkingService.GetFloatingIPByPortID(lb.VipPortID)
 		if err != nil {
-			return err
+			return nil, err
 		}
 
 		if fip != nil && fip.FloatingIP != "" {
 			if err = s.networkingService.DisassociateFloatingIP(openStackCluster, fip.FloatingIP); err != nil {
-				return err
+				return nil, err
 			}
 
 			// If the floating IP is user-provided (BYO floating IP), don't delete it.
 			if openStackCluster.Spec.APIServerFloatingIP == nil || *openStackCluster.Spec.APIServerFloatingIP != fip.FloatingIP {
 				if err = s.networkingService.DeleteFloatingIP(openStackCluster, fip.FloatingIP); err != nil {
-					return err
+					return nil, err
 				}
 			} else {
 				s.scope.Logger().Info("Skipping load balancer floating IP deletion as it's a user-provided resource",
 					"name", loadBalancerName, "fip", fip.FloatingIP)
@@ -698,11 +711,14 @@ func (s *Service) DeleteLoadBalancer(openStackCluster *infrav1.OpenStackCluster,
 	err = s.loadbalancerClient.DeleteLoadBalancer(lb.ID, deleteOpts)
 	if err != nil && !capoerrors.IsNotFound(err) {
 		record.Warnf(openStackCluster, "FailedDeleteLoadBalancer", "Failed to delete load balancer %s with id %s: %v", lb.Name, lb.ID, err)
-		return err
+		return nil, err
 	}
 
 	record.Eventf(openStackCluster, "SuccessfulDeleteLoadBalancer", "Deleted load balancer %s with id %s", lb.Name, lb.ID)
-	return nil
+
+	// If we have reached this point, the load balancer wasn't deleted right away, but the request to delete it didn't return an error.
+	// So we want to requeue until the load balancer and its associated ports are actually deleted.
+	return &ctrl.Result{RequeueAfter: waitForOctaviaLBCleanup}, nil
 }
 
 func (s *Service) DeleteLoadBalancerMember(openStackCluster *infrav1.OpenStackCluster, openStackMachine *infrav1.OpenStackMachine, clusterResourceName string) error {
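
Reviewer note: the service-side change above boils down to the
following decision. This is a simplified sketch under assumed names
(decideOnProvisioningStatus and deleteFn are illustrative helpers, not
code in this repository):

    package sketch

    import (
    	"time"

    	ctrl "sigs.k8s.io/controller-runtime"
    )

    const waitForOctaviaLBCleanup = 15 * time.Second

    // decideOnProvisioningStatus mirrors the logic DeleteLoadBalancer now
    // follows: if Octavia is already tearing the LB down (PENDING_DELETE),
    // only requeue; otherwise issue the cascade delete and still requeue,
    // because the LB and its ports disappear asynchronously afterwards.
    func decideOnProvisioningStatus(status string, deleteFn func() error) (*ctrl.Result, error) {
    	if status == "PENDING_DELETE" {
    		// Nothing to do but wait for Octavia to finish the deletion.
    		return &ctrl.Result{RequeueAfter: waitForOctaviaLBCleanup}, nil
    	}
    	if err := deleteFn(); err != nil {
    		// The delete request itself failed; report the error.
    		return nil, err
    	}
    	// The delete request was accepted; poll again until the LB is gone.
    	return &ctrl.Result{RequeueAfter: waitForOctaviaLBCleanup}, nil
    }

Requeueing in both branches is deliberate: Octavia removes the load
balancer and its ports asynchronously, so the only reliable completion
signal is the LB no longer existing on a later reconcile.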