Don't try to resolve machine on delete if cluster not ready
This fixes a bug where, if we created a machine for a cluster which
never became ready, we would never be able to 'resolve' the machine and
therefore could never delete it. We address this situation in several
layers:

Firstly, we move the point in the machine controller at which we add the
finalizer. We no longer add the finalizer until we are writing resolved,
so this situation can never occur for newly created machines. This is
safe because we don't create any resources until we have observed that
both the finalizer has been added and resolved is up to date, so the
finalizer is not needed to protect resources which cannot have been
created yet.
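
A condensed sketch of the new reconcileNormal ordering (identifiers as
in the diff below; error handling and logging trimmed, comments are
editorial):

	changed, err := resolveMachineResources(scope, clusterResourceName, openStackCluster, openStackMachine, machine)
	if err != nil {
		return ctrl.Result{}, err
	}
	// Add the finalizer in the same write that persists resolved resources.
	if controllerutil.AddFinalizer(openStackMachine, infrav1.MachineFinalizer) {
		changed = true
	}
	// Requeue if either changed: resources are only created once both have
	// been observed as successfully written.
	if changed {
		return ctrl.Result{}, nil
	}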

Secondly, we shortcut the delete flow if the cluster is not ready. This
is safe for the same reason as above, but is only relevant to machines
created before v0.10.
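
In reconcileDelete this becomes an early return (condensed from the
diff below; the finalizer removal handles the pre-v0.10 upgrade case):

	// Nothing to do if the cluster is not ready because no machine
	// resources were created.
	if !openStackCluster.Status.Ready || openStackCluster.Status.Network == nil {
		controllerutil.RemoveFinalizer(openStackMachine, infrav1.MachineFinalizer)
		return ctrl.Result{}, nil
	}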

Lastly, we surface and restrict the circumstances in which 'Resolved' is
required on delete at all. On closer inspection, it is only required in
the very specific circumstance that the machine has volumes defined and
we are deleting it without the machine ever having been created. To make
this more obvious we split volume deletion out of DeleteInstance and
only resolve the machine spec when it is actually required.
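
The resulting delete path, condensed from the diff below (error
handling as in the full change), only resolves the spec to find
orphaned volumes when no instance exists:

	if instanceStatus == nil && openStackMachine.Status.Resolved != nil {
		// Resolving is only needed here, to clean up volumes created for an
		// instance which itself was never created.
		instanceSpec, err := machineToInstanceSpec(openStackCluster, machine, openStackMachine, "")
		if err != nil {
			return ctrl.Result{}, err
		}
		if err := computeService.DeleteVolumes(instanceSpec); err != nil {
			return ctrl.Result{}, fmt.Errorf("delete volumes: %w", err)
		}
	}
	if instanceStatus != nil {
		if err := computeService.DeleteInstance(openStackMachine, instanceStatus); err != nil {
			return ctrl.Result{}, fmt.Errorf("delete instance: %w", err)
		}
	}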

Two other factors make this change larger than it might otherwise be.

We hit a cyclomatic complexity limit in reconcileDelete(), requiring a
refactor.

We remove the DeleteInstance tests, which after separating out
DeleteVolumes are quite trivial, and replace them with a much more
comprehensive set of tests for reconcileDelete.
mdbooth committed Apr 9, 2024
1 parent 6c83bc4 commit 9ebb706
Showing 7 changed files with 576 additions and 227 deletions.
19 changes: 13 additions & 6 deletions controllers/openstackcluster_controller.go
@@ -290,7 +290,18 @@ func deleteBastion(scope *scope.WithLogger, cluster *clusterv1.Cluster, openStac
}
}

if instanceStatus != nil {
// If no instance was created we currently need to check for orphaned
// volumes. This requires resolving the instance spec.
// TODO: write volumes to status resources on creation so this is no longer required.
if instanceStatus == nil {
instanceSpec, err := bastionToInstanceSpec(openStackCluster, cluster)
if err != nil {
return err
}
if err := computeService.DeleteVolumes(instanceSpec); err != nil {
return fmt.Errorf("delete volumes: %w", err)
}
} else {
instanceNS, err := instanceStatus.NetworkStatus()
if err != nil {
return err
@@ -307,11 +318,7 @@ func deleteBastion(scope *scope.WithLogger, cluster *clusterv1.Cluster, openStac
}
}

instanceSpec, err := bastionToInstanceSpec(openStackCluster, cluster)
if err != nil {
return err
}
if err = computeService.DeleteInstance(openStackCluster, instanceStatus, instanceSpec); err != nil {
if err = computeService.DeleteInstance(openStackCluster, instanceStatus); err != nil {
handleUpdateOSCError(openStackCluster, fmt.Errorf("failed to delete bastion: %w", err))
return fmt.Errorf("failed to delete bastion: %w", err)
}
5 changes: 4 additions & 1 deletion controllers/openstackcluster_controller_test.go
@@ -208,6 +208,9 @@ var _ = Describe("OpenStackCluster controller", func() {
testCluster.Status = infrav1.OpenStackClusterStatus{
Bastion: &infrav1.BastionStatus{
ID: "bastion-uuid",
Resolved: &infrav1.ResolvedMachineSpec{
ImageID: "imageID",
},
},
}
err = k8sClient.Status().Update(ctx, testCluster)
@@ -221,8 +224,8 @@ var _ = Describe("OpenStackCluster controller", func() {
computeClientRecorder.GetServer("bastion-uuid").Return(nil, gophercloud.ErrResourceNotFound{})

err = deleteBastion(scope, capiCluster, testCluster)
Expect(testCluster.Status.Bastion).To(BeNil())
Expect(err).To(BeNil())
Expect(testCluster.Status.Bastion).To(BeNil())
})
It("should adopt an existing bastion even if its uuid is not stored in status", func() {
testCluster.SetName("adopt-existing-bastion")
209 changes: 141 additions & 68 deletions controllers/openstackmachine_controller.go
@@ -169,26 +169,21 @@ func resolveMachineResources(scope *scope.WithLogger, clusterResourceName string
openStackMachine.Status.Resolved = resolved
}
// Resolve and store resources
changed, err := compute.ResolveMachineSpec(scope,
return compute.ResolveMachineSpec(scope,
&openStackMachine.Spec, resolved,
clusterResourceName, openStackMachine.Name,
openStackCluster, getManagedSecurityGroup(openStackCluster, machine))
if err != nil {
return false, err
}
if changed {
// If the resolved machine spec changed we need to start the reconcile again to prevent inconsistency between reconciles.
return true, nil
}
}

func adoptMachineResources(scope *scope.WithLogger, openStackMachine *infrav1.OpenStackMachine) error {
resources := openStackMachine.Status.Resources
if resources == nil {
resources = &infrav1.MachineResources{}
openStackMachine.Status.Resources = resources
}

// Adopt any existing resources
return false, compute.AdoptMachineResources(scope, resolved, resources)
return compute.AdoptMachineResources(scope, openStackMachine.Status.Resolved, resources)
}

func patchMachine(ctx context.Context, patchHelper *patch.Helper, openStackMachine *infrav1.OpenStackMachine, machine *clusterv1.Machine, options ...patch.Option) error {
@@ -256,66 +251,72 @@ func (r *OpenStackMachineReconciler) reconcileDelete(scope *scope.WithLogger, cl
return ctrl.Result{}, err
}

// We may have resources to adopt if the cluster is ready
if openStackCluster.Status.Ready && openStackCluster.Status.Network != nil {
if _, err := resolveMachineResources(scope, clusterResourceName, openStackCluster, openStackMachine, machine); err != nil {
return ctrl.Result{}, err
}
// Nothing to do if the cluster is not ready because no machine resources were created.
if !openStackCluster.Status.Ready || openStackCluster.Status.Network == nil {
// The finalizer should not have been added yet in this case,
// but the following handles the upgrade case.
controllerutil.RemoveFinalizer(openStackMachine, infrav1.MachineFinalizer)
return ctrl.Result{}, nil
}

if openStackCluster.Spec.APIServerLoadBalancer.IsEnabled() {
loadBalancerService, err := loadbalancer.NewService(scope)
if err != nil {
return ctrl.Result{}, err
}
// For machines created after v0.10, or any machine which has been
// reconciled at least once by v0.10 or later, status.Resolved always
// exists before any resources are created. We can therefore assume
// that if it does not exist, no resources were created.
//
// There is an upgrade edge case where a machine may have been marked
// deleted before upgrade but we are completing it after upgrade. For
// this use case only we make a best effort to resolve resources before
// continuing, but if we get an error we log it and continue anyway.
// This has the potential to leak resources, but only in this specific
// edge case. The alternative is to continue retrying until it succeeds,
// but that risks never deleting a machine which cannot be resolved due
// to a spec error.
//
// This code can and should be deleted in a future release when we are
// sure that all machines have been reconciled at least by a v0.10 or
// later controller.
if _, err := resolveMachineResources(scope, clusterResourceName, openStackCluster, openStackMachine, machine); err != nil {
// Return the error, but allow the resource to be removed anyway.
controllerutil.RemoveFinalizer(openStackMachine, infrav1.MachineFinalizer)
return ctrl.Result{}, err
}

err = loadBalancerService.DeleteLoadBalancerMember(openStackCluster, machine, openStackMachine, clusterResourceName)
if err != nil {
conditions.MarkFalse(openStackMachine, infrav1.APIServerIngressReadyCondition, infrav1.LoadBalancerMemberErrorReason, clusterv1.ConditionSeverityWarning, "Machine could not be removed from load balancer: %v", err)
return ctrl.Result{}, err
}
// Check for any orphaned resources
// N.B. Unlike resolveMachineResources, we must always look for orphaned resources in the delete path.
if err := adoptMachineResources(scope, openStackMachine); err != nil {
return ctrl.Result{}, fmt.Errorf("adopting machine resources: %w", err)
}

var instanceStatus *compute.InstanceStatus
if openStackMachine.Status.InstanceID != nil {
instanceStatus, err = computeService.GetInstanceStatus(*openStackMachine.Status.InstanceID)
if err != nil {
return ctrl.Result{}, err
}
} else if instanceStatus, err = computeService.GetInstanceStatusByName(openStackMachine, openStackMachine.Name); err != nil {
instanceStatus, err := getInstanceStatus(openStackMachine, computeService)
if err != nil {
return ctrl.Result{}, err
}
if !openStackCluster.Spec.APIServerLoadBalancer.IsEnabled() && util.IsControlPlaneMachine(machine) && openStackCluster.Spec.APIServerFloatingIP == nil {
if instanceStatus != nil {
instanceNS, err := instanceStatus.NetworkStatus()
if err != nil {
openStackMachine.SetFailure(
capierrors.UpdateMachineError,
fmt.Errorf("get network status for OpenStack instance %s with ID %s: %v", instanceStatus.Name(), instanceStatus.ID(), err),
)
return ctrl.Result{}, nil
}

addresses := instanceNS.Addresses()
for _, address := range addresses {
if address.Type == corev1.NodeExternalIP {
if err = networkingService.DeleteFloatingIP(openStackMachine, address.Address); err != nil {
conditions.MarkFalse(openStackMachine, infrav1.APIServerIngressReadyCondition, infrav1.FloatingIPErrorReason, clusterv1.ConditionSeverityError, "Deleting floating IP failed: %v", err)
return ctrl.Result{}, fmt.Errorf("delete floating IP %q: %w", address.Address, err)
}
}
}
if util.IsControlPlaneMachine(machine) {
if err := removeAPIServerEndpoint(scope, openStackCluster, openStackMachine, instanceStatus, clusterResourceName); err != nil {
return ctrl.Result{}, err
}
}

instanceSpec, err := machineToInstanceSpec(openStackCluster, machine, openStackMachine, "")
if err != nil {
return ctrl.Result{}, err
// If no instance was created we currently need to check for orphaned
// volumes. This requires resolving the instance spec.
// TODO: write volumes to status resources on creation so this is no longer required.
if instanceStatus == nil && openStackMachine.Status.Resolved != nil {
instanceSpec, err := machineToInstanceSpec(openStackCluster, machine, openStackMachine, "")
if err != nil {
return ctrl.Result{}, err
}
if err := computeService.DeleteVolumes(instanceSpec); err != nil {
return ctrl.Result{}, fmt.Errorf("delete volumes: %w", err)
}
}

if err := computeService.DeleteInstance(openStackMachine, instanceStatus, instanceSpec); err != nil {
conditions.MarkFalse(openStackMachine, infrav1.InstanceReadyCondition, infrav1.InstanceDeleteFailedReason, clusterv1.ConditionSeverityError, "Deleting instance failed: %v", err)
return ctrl.Result{}, fmt.Errorf("delete instance: %w", err)
if instanceStatus != nil {
if err := computeService.DeleteInstance(openStackMachine, instanceStatus); err != nil {
conditions.MarkFalse(openStackMachine, infrav1.InstanceReadyCondition, infrav1.InstanceDeleteFailedReason, clusterv1.ConditionSeverityError, "Deleting instance failed: %v", err)
return ctrl.Result{}, fmt.Errorf("delete instance: %w", err)
}
}

trunkSupported, err := networkingService.IsTrunkExtSupported()
@@ -341,6 +342,62 @@
return ctrl.Result{}, nil
}

func getInstanceStatus(openStackMachine *infrav1.OpenStackMachine, computeService *compute.Service) (*compute.InstanceStatus, error) {
if openStackMachine.Status.InstanceID != nil {
return computeService.GetInstanceStatus(*openStackMachine.Status.InstanceID)
}
return computeService.GetInstanceStatusByName(openStackMachine, openStackMachine.Name)
}

func removeAPIServerEndpoint(scope *scope.WithLogger, openStackCluster *infrav1.OpenStackCluster, openStackMachine *infrav1.OpenStackMachine, instanceStatus *compute.InstanceStatus, clusterResourceName string) error {
if openStackCluster.Spec.APIServerLoadBalancer.IsEnabled() {
loadBalancerService, err := loadbalancer.NewService(scope)
if err != nil {
return err
}

err = loadBalancerService.DeleteLoadBalancerMember(openStackCluster, openStackMachine, clusterResourceName)
if err != nil {
conditions.MarkFalse(openStackMachine, infrav1.APIServerIngressReadyCondition, infrav1.LoadBalancerMemberErrorReason, clusterv1.ConditionSeverityWarning, "Machine could not be removed from load balancer: %v", err)
return err
}
return nil
}

// XXX(mdbooth): This looks wrong to me. Surely we should only ever
// disassociate the floating IP here. I would expect the API server
// floating IP to be created and deleted with the cluster. And if the
// delete behaviour is correct, we leak it if the instance was
// previously deleted.
if openStackCluster.Spec.APIServerFloatingIP == nil && instanceStatus != nil {
instanceNS, err := instanceStatus.NetworkStatus()
if err != nil {
openStackMachine.SetFailure(
capierrors.UpdateMachineError,
fmt.Errorf("get network status for OpenStack instance %s with ID %s: %v", instanceStatus.Name(), instanceStatus.ID(), err),
)
return nil
}

networkingService, err := networking.NewService(scope)
if err != nil {
return err
}

addresses := instanceNS.Addresses()
for _, address := range addresses {
if address.Type == corev1.NodeExternalIP {
if err = networkingService.DeleteFloatingIP(openStackMachine, address.Address); err != nil {
conditions.MarkFalse(openStackMachine, infrav1.APIServerIngressReadyCondition, infrav1.FloatingIPErrorReason, clusterv1.ConditionSeverityError, "Deleting floating IP failed: %v", err)
return fmt.Errorf("delete floating IP %q: %w", address.Address, err)
}
}
}
}

return nil
}

// GetPortIDs returns a list of port IDs from a list of PortStatus.
func GetPortIDs(ports []infrav1.PortStatus) []string {
portIDs := make([]string, len(ports))
@@ -489,34 +546,50 @@ func (r *OpenStackMachineReconciler) reconcileNormal(ctx context.Context, scope
return ctrl.Result{}, nil
}

// If the OpenStackMachine doesn't have our finalizer, add it.
if controllerutil.AddFinalizer(openStackMachine, infrav1.MachineFinalizer) {
// Register the finalizer immediately to avoid orphaning OpenStack resources on delete
return ctrl.Result{}, nil
}

if !openStackCluster.Status.Ready {
scope.Logger().Info("Cluster infrastructure is not ready yet, re-queuing machine")
conditions.MarkFalse(openStackMachine, infrav1.InstanceReadyCondition, infrav1.WaitingForClusterInfrastructureReason, clusterv1.ConditionSeverityInfo, "")
return ctrl.Result{RequeueAfter: waitForClusterInfrastructureReadyDuration}, nil
}

if changed, err := resolveMachineResources(scope, clusterResourceName, openStackCluster, openStackMachine, machine); changed || err != nil {
scope.Logger().V(6).Info("Machine resources updated, requeuing")
return ctrl.Result{}, err
}

// Make sure bootstrap data is available and populated.
if machine.Spec.Bootstrap.DataSecretName == nil {
scope.Logger().Info("Bootstrap data secret reference is not yet available")
conditions.MarkFalse(openStackMachine, infrav1.InstanceReadyCondition, infrav1.WaitingForBootstrapDataReason, clusterv1.ConditionSeverityInfo, "")
return ctrl.Result{}, nil
}
userData, err := r.getBootstrapData(ctx, machine, openStackMachine)

changed, err := resolveMachineResources(scope, clusterResourceName, openStackCluster, openStackMachine, machine)
if err != nil {
return ctrl.Result{}, err
}

// Also add the finalizer when writing resolved resources so we can start creating resources on the next reconcile.
if controllerutil.AddFinalizer(openStackMachine, infrav1.MachineFinalizer) {
changed = true
}

// We requeue if we either added the finalizer or resolved machine
// resources. This means that we never create any resources unless we
// have observed that the finalizer and resolved machine resources were
// successfully written in a previous transaction. This in turn means
// that in the delete path we can be sure that if there are no resolved
// resources then no resources were created.
if changed {
scope.Logger().V(6).Info("Machine resources updated, requeuing")
return ctrl.Result{}, nil
}

// Check for orphaned resources previously created but not written to the status
if err := adoptMachineResources(scope, openStackMachine); err != nil {
return ctrl.Result{}, fmt.Errorf("adopting machine resources: %w", err)
}

scope.Logger().Info("Reconciling Machine")
userData, err := r.getBootstrapData(ctx, machine, openStackMachine)
if err != nil {
return ctrl.Result{}, err
}

computeService, err := compute.NewService(scope)
if err != nil {