From c940ccd0c38b6dd1efbc31c0ec20d6d489dac765 Mon Sep 17 00:00:00 2001
From: Francois Eleouet
Date: Fri, 17 Nov 2023 11:55:40 +0100
Subject: [PATCH] Use get instead of list to retrieve servers

Under certain circumstances, when there are non-responsive cells, nova
may return a partial result instead of an error (this is controlled by
the list_records_by_skipping_down_cells parameter, which defaults to
true, so no error is raised). When this happens, the CAPO controller
tries to create a new server for the machine, which results in the
ports of the existing server being deleted.

To avoid this issue, store the server ID in the machine spec as soon as
the server is created, and retrieve the server with GET on that UUID
instead of LIST. For that purpose, the server polling that used to take
place during instance creation has to be moved to the main reconcile
loop.
---
 controllers/openstackcluster_controller.go | 69 ++++++++++++-------
 .../openstackcluster_controller_test.go | 15 ++--
 controllers/openstackmachine_controller.go | 23 +++++--
 controllers/suite_test.go | 9 ++-
 pkg/cloud/services/compute/instance.go | 23 +------
 pkg/cloud/services/compute/instance_test.go | 43 ------------
 6 files changed, 80 insertions(+), 102 deletions(-)

diff --git a/controllers/openstackcluster_controller.go b/controllers/openstackcluster_controller.go
index e6d3025cba..28bc420628 100644
--- a/controllers/openstackcluster_controller.go
+++ b/controllers/openstackcluster_controller.go
@@ -206,6 +206,9 @@ func contains(arr []string, target string) bool {
 }
 
 func deleteBastion(scope scope.Scope, cluster *clusterv1.Cluster, openStackCluster *infrav1.OpenStackCluster) error {
+    if openStackCluster.Status.Bastion == nil || openStackCluster.Status.Bastion.ID == "" {
+        return nil
+    }
     computeService, err := compute.NewService(scope)
     if err != nil {
         return err
     }
@@ -215,8 +218,7 @@ func deleteBastion(scope scope.Scope, cluster *clusterv1.Cluster, openStackClust
         return err
     }
 
-    instanceName := fmt.Sprintf("%s-bastion", cluster.Name)
-    instanceStatus, err := computeService.GetInstanceStatusByName(openStackCluster, instanceName)
+    instanceStatus, err := computeService.GetInstanceStatus(openStackCluster.Status.Bastion.ID)
     if err != nil {
         return err
     }
@@ -277,8 +279,8 @@ func reconcileNormal(scope scope.Scope, cluster *clusterv1.Cluster, openStackClu
         return reconcile.Result{}, err
     }
 
-    if err = reconcileBastion(scope, cluster, openStackCluster); err != nil {
-        return reconcile.Result{}, err
+    if result, err := reconcileBastion(scope, cluster, openStackCluster); err != nil {
+        return result, err
     }
 
     availabilityZones, err := computeService.GetAvailabilityZones()
@@ -308,82 +310,99 @@ func reconcileNormal(scope scope.Scope, cluster *clusterv1.Cluster, openStackClu
     return reconcile.Result{}, nil
 }
 
-func reconcileBastion(scope scope.Scope, cluster *clusterv1.Cluster, openStackCluster *infrav1.OpenStackCluster) error {
+func reconcileBastion(scope scope.Scope, cluster *clusterv1.Cluster, openStackCluster *infrav1.OpenStackCluster) (ctrl.Result, error) {
     scope.Logger().Info("Reconciling Bastion")
 
     if openStackCluster.Spec.Bastion == nil || !openStackCluster.Spec.Bastion.Enabled {
-        return deleteBastion(scope, cluster, openStackCluster)
+        return reconcile.Result{}, deleteBastion(scope, cluster, openStackCluster)
     }
 
     computeService, err := compute.NewService(scope)
     if err != nil {
-        return err
+        return reconcile.Result{}, err
     }
 
     instanceSpec := bastionToInstanceSpec(openStackCluster, cluster.Name)
     bastionHash, err := compute.HashInstanceSpec(instanceSpec)
     if err != nil {
-        return fmt.Errorf("failed computing bastion hash from instance spec: %w", err)
return fmt.Errorf("failed computing bastion hash from instance spec: %w", err) + return reconcile.Result{}, fmt.Errorf("failed computing bastion hash from instance spec: %w", err) } - instanceStatus, err := computeService.GetInstanceStatusByName(openStackCluster, fmt.Sprintf("%s-bastion", cluster.Name)) - if err != nil { - return err + var instanceStatus *compute.InstanceStatus + if openStackCluster.Status.Bastion != nil && openStackCluster.Status.Bastion.ID != "" { + if instanceStatus, err = computeService.GetInstanceStatus(openStackCluster.Status.Bastion.ID); err != nil { + return reconcile.Result{}, err + } } - if instanceStatus != nil { + if instanceStatus == nil { + instanceStatus, err = computeService.CreateInstance(openStackCluster, openStackCluster, instanceSpec, cluster.Name) + if err != nil { + return reconcile.Result{}, fmt.Errorf("failed to create bastion: %w", err) + } + openStackCluster.Status.Bastion = &infrav1.BastionStatus{ + ID: instanceStatus.ID(), + } + } else { if !bastionHashHasChanged(bastionHash, openStackCluster.ObjectMeta.Annotations) { bastion, err := instanceStatus.BastionStatus(openStackCluster) if err != nil { - return err + return reconcile.Result{}, err } // Add the current hash if no annotation is set. if _, ok := openStackCluster.ObjectMeta.Annotations[BastionInstanceHashAnnotation]; !ok { annotations.AddAnnotations(openStackCluster, map[string]string{BastionInstanceHashAnnotation: bastionHash}) } openStackCluster.Status.Bastion = bastion - return nil + return ctrl.Result{}, nil } if err := deleteBastion(scope, cluster, openStackCluster); err != nil { - return err + return ctrl.Result{}, err } } - - instanceStatus, err = computeService.CreateInstance(openStackCluster, openStackCluster, instanceSpec, cluster.Name) - if err != nil { - return fmt.Errorf("failed to reconcile bastion: %w", err) + // Make sure that bastion instance has a valid state + switch instanceStatus.State() { + case infrav1.InstanceStateError: + return ctrl.Result{}, fmt.Errorf("failed to reconcile bastion, instance state is ERROR") + case infrav1.InstanceStateBuilding: + // Requeue until bastion becomes active + return ctrl.Result{RequeueAfter: waitForBuildingInstanceToReconcile}, nil + case infrav1.InstanceStateDeleted: + // This should normally be handled by deleteBastion + openStackCluster.Status.Bastion = nil + return ctrl.Result{}, nil } networkingService, err := networking.NewService(scope) if err != nil { - return err + return ctrl.Result{}, err } clusterName := fmt.Sprintf("%s-%s", cluster.Namespace, cluster.Name) fp, err := networkingService.GetOrCreateFloatingIP(openStackCluster, openStackCluster, clusterName, openStackCluster.Spec.Bastion.FloatingIP) if err != nil { handleUpdateOSCError(openStackCluster, fmt.Errorf("failed to get or create floating IP for bastion: %w", err)) - return fmt.Errorf("failed to get or create floating IP for bastion: %w", err) + return ctrl.Result{}, fmt.Errorf("failed to get or create floating IP for bastion: %w", err) } port, err := computeService.GetManagementPort(openStackCluster, instanceStatus) if err != nil { err = fmt.Errorf("getting management port for bastion: %w", err) handleUpdateOSCError(openStackCluster, err) - return err + return ctrl.Result{}, err } err = networkingService.AssociateFloatingIP(openStackCluster, fp, port.ID) if err != nil { handleUpdateOSCError(openStackCluster, fmt.Errorf("failed to associate floating IP with bastion: %w", err)) - return fmt.Errorf("failed to associate floating IP with bastion: %w", err) + return 
     }
 
     bastion, err := instanceStatus.BastionStatus(openStackCluster)
     if err != nil {
-        return err
+        return ctrl.Result{}, err
     }
     bastion.FloatingIP = fp.FloatingIP
     openStackCluster.Status.Bastion = bastion
     annotations.AddAnnotations(openStackCluster, map[string]string{BastionInstanceHashAnnotation: bastionHash})
-    return nil
+    return ctrl.Result{}, nil
 }
 
 func bastionToInstanceSpec(openStackCluster *infrav1.OpenStackCluster, clusterName string) *compute.InstanceSpec {
diff --git a/controllers/openstackcluster_controller_test.go b/controllers/openstackcluster_controller_test.go
index a4f0cfb442..e8e1189f01 100644
--- a/controllers/openstackcluster_controller_test.go
+++ b/controllers/openstackcluster_controller_test.go
@@ -22,7 +22,7 @@ import (
     "github.com/go-logr/logr"
     "github.com/golang/mock/gomock"
 
-    "github.com/gophercloud/gophercloud/openstack/compute/v2/servers"
+    "github.com/gophercloud/gophercloud"
     "github.com/gophercloud/gophercloud/openstack/networking/v2/extensions/security/groups"
     "github.com/gophercloud/gophercloud/openstack/networking/v2/networks"
     "github.com/gophercloud/gophercloud/openstack/networking/v2/subnets"
@@ -38,7 +38,6 @@ import (
     "sigs.k8s.io/controller-runtime/pkg/reconcile"
 
     infrav1 "sigs.k8s.io/cluster-api-provider-openstack/api/v1alpha7"
-    "sigs.k8s.io/cluster-api-provider-openstack/pkg/clients"
     "sigs.k8s.io/cluster-api-provider-openstack/pkg/scope"
 )
 
@@ -195,18 +194,24 @@ var _ = Describe("OpenStackCluster controller", func() {
         Expect(err).To(BeNil())
         err = k8sClient.Create(ctx, capiCluster)
         Expect(err).To(BeNil())
+        testCluster.Status = infrav1.OpenStackClusterStatus{
+            Bastion: &infrav1.BastionStatus{
+                ID: "bastion-uuid",
+            },
+        }
+        err = k8sClient.Status().Update(ctx, testCluster)
+        Expect(err).To(BeNil())
         scope, err := mockScopeFactory.NewClientScopeFromCluster(ctx, k8sClient, testCluster, nil, logr.Discard())
         Expect(err).To(BeNil())
 
         computeClientRecorder := mockScopeFactory.ComputeClient.EXPECT()
-        computeClientRecorder.ListServers(servers.ListOpts{
-            Name: "^capi-cluster-bastion$",
-        }).Return([]clients.ServerExt{}, nil)
+        computeClientRecorder.GetServer("bastion-uuid").Return(nil, gophercloud.ErrResourceNotFound{})
 
         networkClientRecorder := mockScopeFactory.NetworkClient.EXPECT()
         networkClientRecorder.ListSecGroup(gomock.Any()).Return([]groups.SecGroup{}, nil)
 
         err = deleteBastion(scope, capiCluster, testCluster)
+        Expect(testCluster.Status.Bastion).To(BeNil())
         Expect(err).To(BeNil())
     })
     It("should implicitly filter cluster subnets by cluster network", func() {
diff --git a/controllers/openstackmachine_controller.go b/controllers/openstackmachine_controller.go
index aa8caa3d89..60bd897a7f 100644
--- a/controllers/openstackmachine_controller.go
+++ b/controllers/openstackmachine_controller.go
@@ -67,6 +67,7 @@ type OpenStackMachineReconciler struct {
 const (
     waitForClusterInfrastructureReadyDuration = 15 * time.Second
     waitForInstanceBecomeActiveToReconcile = 60 * time.Second
+    waitForBuildingInstanceToReconcile = 10 * time.Second
 )
 
 // +kubebuilder:rbac:groups=infrastructure.cluster.x-k8s.io,resources=openstackmachines,verbs=get;list;watch;create;update;patch;delete
@@ -246,7 +247,10 @@ func (r *OpenStackMachineReconciler) reconcileDelete(scope scope.Scope, cluster
         }
     }
 
-    instanceStatus, err := computeService.GetInstanceStatusByName(openStackMachine, openStackMachine.Name)
+    var instanceStatus *compute.InstanceStatus
+    if openStackMachine.Spec.InstanceID != nil {
+        instanceStatus, err = computeService.GetInstanceStatus(*openStackMachine.Spec.InstanceID)
+    }
     if err != nil {
         return ctrl.Result{}, err
     }
@@ -379,6 +383,9 @@ func (r *OpenStackMachineReconciler) reconcileNormal(ctx context.Context, scope
         scope.Logger().Info("Machine instance state is DELETED, no actions")
         conditions.MarkFalse(openStackMachine, infrav1.InstanceReadyCondition, infrav1.InstanceDeletedReason, clusterv1.ConditionSeverityError, "")
         return ctrl.Result{}, nil
+    case infrav1.InstanceStateBuilding:
+        scope.Logger().Info("Waiting for instance to become ACTIVE", "id", instanceStatus.ID(), "status", instanceStatus.State())
+        return ctrl.Result{RequeueAfter: waitForBuildingInstanceToReconcile}, nil
     default:
         // The other state is normal (for example, migrating, shutoff) but we don't want to proceed until it's ACTIVE
         // due to potential conflict or unexpected actions
@@ -431,11 +438,15 @@ func (r *OpenStackMachineReconciler) reconcileNormal(ctx context.Context, scope
 }
 
 func (r *OpenStackMachineReconciler) getOrCreate(logger logr.Logger, cluster *clusterv1.Cluster, openStackCluster *infrav1.OpenStackCluster, machine *clusterv1.Machine, openStackMachine *infrav1.OpenStackMachine, computeService *compute.Service, userData string) (*compute.InstanceStatus, error) {
-    instanceStatus, err := computeService.GetInstanceStatusByName(openStackMachine, openStackMachine.Name)
-    if err != nil {
-        logger.Info("Unable to get OpenStack instance", "name", openStackMachine.Name)
-        conditions.MarkFalse(openStackMachine, infrav1.InstanceReadyCondition, infrav1.OpenStackErrorReason, clusterv1.ConditionSeverityError, err.Error())
-        return nil, err
+    var instanceStatus *compute.InstanceStatus
+    var err error
+    if openStackMachine.Spec.InstanceID != nil {
+        instanceStatus, err = computeService.GetInstanceStatus(*openStackMachine.Spec.InstanceID)
+        if err != nil {
+            logger.Info("Unable to get OpenStack instance", "name", openStackMachine.Name)
+            conditions.MarkFalse(openStackMachine, infrav1.InstanceReadyCondition, infrav1.OpenStackErrorReason, clusterv1.ConditionSeverityError, err.Error())
+            return nil, err
+        }
     }
 
     if instanceStatus == nil {
diff --git a/controllers/suite_test.go b/controllers/suite_test.go
index b0edc98d5f..aeb0f12a22 100644
--- a/controllers/suite_test.go
+++ b/controllers/suite_test.go
@@ -32,6 +32,7 @@ import (
     "k8s.io/apimachinery/pkg/types"
     "k8s.io/client-go/kubernetes/scheme"
     "k8s.io/client-go/rest"
+    "k8s.io/utils/pointer"
     clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
     "sigs.k8s.io/cluster-api/test/framework"
     "sigs.k8s.io/controller-runtime/pkg/client"
@@ -148,9 +149,13 @@ var _ = Describe("When calling getOrCreate", func() {
         cluster := &clusterv1.Cluster{}
         openStackCluster := &infrav1.OpenStackCluster{}
         machine := &clusterv1.Machine{}
-        openStackMachine := &infrav1.OpenStackMachine{}
+        openStackMachine := &infrav1.OpenStackMachine{
+            Spec: infrav1.OpenStackMachineSpec{
+                InstanceID: pointer.String("machine-uuid"),
+            },
+        }
 
-        mockScopeFactory.ComputeClient.EXPECT().ListServers(gomock.Any()).Return(nil, errors.New("Test error when listing servers"))
+        mockScopeFactory.ComputeClient.EXPECT().GetServer(gomock.Any()).Return(nil, errors.New("Test error when getting server"))
         instanceStatus, err := reconsiler.getOrCreate(logger, cluster, openStackCluster, machine, openStackMachine, computeService, "")
         Expect(err).To(HaveOccurred())
         Expect(instanceStatus).To(BeNil())
diff --git a/pkg/cloud/services/compute/instance.go b/pkg/cloud/services/compute/instance.go
index c6b70af87e..8c6ba7b00d 100644
--- a/pkg/cloud/services/compute/instance.go
+++ b/pkg/cloud/services/compute/instance.go
@@ -333,27 +333,8 @@ func (s *Service) createInstanceImpl(eventObject runtime.Object, openStackCluste
         return nil, fmt.Errorf("error creating Openstack instance: %v", err)
     }
 
-    var createdInstance *InstanceStatus
-    err = wait.PollUntilContextTimeout(context.TODO(), retryInterval, instanceCreateTimeout, true, func(_ context.Context) (bool, error) {
-        createdInstance, err = s.GetInstanceStatus(server.ID)
-        if err != nil {
-            if capoerrors.IsRetryable(err) {
-                return false, nil
-            }
-            return false, err
-        }
-        if createdInstance.State() == infrav1.InstanceStateError {
-            return false, fmt.Errorf("error creating OpenStack instance %s, status changed to error", createdInstance.ID())
-        }
-        return createdInstance.State() == infrav1.InstanceStateActive, nil
-    })
-    if err != nil {
-        record.Warnf(eventObject, "FailedCreateServer", "Failed to create server %s: %v", createdInstance.Name(), err)
-        return nil, err
-    }
-
-    record.Eventf(eventObject, "SuccessfulCreateServer", "Created server %s with id %s", createdInstance.Name(), createdInstance.ID())
-    return createdInstance, nil
+    record.Eventf(eventObject, "SuccessfulCreateServer", "Created server %s with id %s", server.Name, server.ID)
+    return &InstanceStatus{server, s.scope.Logger()}, nil
 }
 
 func volumeName(instanceName string, nameSuffix string) string {
diff --git a/pkg/cloud/services/compute/instance_test.go b/pkg/cloud/services/compute/instance_test.go
index cafc9e3edc..a0f4af41fb 100644
--- a/pkg/cloud/services/compute/instance_test.go
+++ b/pkg/cloud/services/compute/instance_test.go
@@ -406,17 +406,6 @@ func TestService_ReconcileInstance(t *testing.T) {
         })
     }
 
-    // Expected calls when polling for server creation
-    expectServerPoll := func(computeRecorder *mock.MockComputeClientMockRecorder, states []string) {
-        for _, state := range states {
-            computeRecorder.GetServer(instanceUUID).Return(returnedServer(state), nil)
-        }
-    }
-
-    expectServerPollSuccess := func(computeRecorder *mock.MockComputeClientMockRecorder) {
-        expectServerPoll(computeRecorder, []string{"ACTIVE"})
-    }
-
     returnedVolume := func(uuid string, status string) *volumes.Volume {
         return &volumes.Volume{
             ID: uuid,
@@ -460,7 +449,6 @@ func TestService_ReconcileInstance(t *testing.T) {
                 expectDefaultImageAndFlavor(r.compute, r.image)
 
                 expectCreateServer(r.compute, getDefaultServerMap(), false)
-                expectServerPollSuccess(r.compute)
             },
             wantErr: false,
         },
@@ -503,32 +491,6 @@ func TestService_ReconcileInstance(t *testing.T) {
             },
             wantErr: true,
         },
-        {
-            name: "Poll until server is created",
-            getInstanceSpec: getDefaultInstanceSpec,
-            expect: func(r *recorders) {
-                expectUseExistingDefaultPort(r.network)
-                expectDefaultImageAndFlavor(r.compute, r.image)
-
-                expectCreateServer(r.compute, getDefaultServerMap(), false)
-                expectServerPoll(r.compute, []string{"BUILDING", "ACTIVE"})
-            },
-            wantErr: false,
-        },
-        {
-            name: "Server errors during creation",
-            getInstanceSpec: getDefaultInstanceSpec,
-            expect: func(r *recorders) {
-                expectUseExistingDefaultPort(r.network)
-                expectDefaultImageAndFlavor(r.compute, r.image)
-
-                expectCreateServer(r.compute, getDefaultServerMap(), false)
-                expectServerPoll(r.compute, []string{"BUILDING", "ERROR"})
-
-                // Don't delete ports because the server is created: DeleteInstance will do it
-            },
-            wantErr: true,
-        },
         {
             name: "Boot from volume success",
             getInstanceSpec: func() *InstanceSpec {
@@ -567,7 +529,6 @@ func TestService_ReconcileInstance(t *testing.T) {
                     },
                 }
                expectCreateServer(r.compute, createMap, false)
-                expectServerPollSuccess(r.compute)
 
                 // Don't delete ports because the server is created: DeleteInstance will do it
             },
@@ -614,7 +575,6 @@ func TestService_ReconcileInstance(t *testing.T) {
                     },
                 }
                 expectCreateServer(r.compute, createMap, false)
-                expectServerPollSuccess(r.compute)
 
                 // Don't delete ports because the server is created: DeleteInstance will do it
             },
@@ -734,7 +694,6 @@ func TestService_ReconcileInstance(t *testing.T) {
                     },
                 }
                 expectCreateServer(r.compute, createMap, false)
-                expectServerPollSuccess(r.compute)
 
                 // Don't delete ports because the server is created: DeleteInstance will do it
             },
@@ -809,7 +768,6 @@ func TestService_ReconcileInstance(t *testing.T) {
                     },
                 }
                 expectCreateServer(r.compute, createMap, false)
-                expectServerPollSuccess(r.compute)
 
                 // Don't delete ports because the server is created: DeleteInstance will do it
             },
@@ -870,7 +828,6 @@ func TestService_ReconcileInstance(t *testing.T) {
                     },
                 }
                 expectCreateServer(r.compute, createMap, false)
-                expectServerPollSuccess(r.compute)
 
                 // Don't delete ports because the server is created: DeleteInstance will do it
             },