Use GET instead of LIST to retrieve servers
Under certain circumstances, when there are non-responsive cells, Nova
may return a partial result instead of an error.

(This is controlled by the list_records_by_skipping_down_cells parameter,
which is true by default, so no error is raised.)

When it hits this issue, the CAPO controller will try to create a new
server for the machine, deleting the ports of the existing one in the process.

To avoid this issue, use GET instead of LIST with the server UUID stored
in the machine spec, and store the server ID in the machine spec
immediately when it is created. For that purpose, the server polling that
used to take place in CreateMachine has to be moved to the main reconcile loop.
zioc committed Nov 24, 2023
1 parent 821a1a2 commit c940ccd
Showing 6 changed files with 80 additions and 102 deletions.
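
The following is a minimal, self-contained Go sketch of the failure mode and the fix described in the commit message above. It uses toy stand-ins (server, computeAPI, listByName, getByID) rather than the provider's real compute.Service API; the actual changes are in the diffs below. With list_records_by_skipping_down_cells enabled, a name-based LIST can silently return a partial (here, empty) result when cells are down, so an existing server looks absent, whereas a GET by the stored UUID either resolves or fails with an explicit error that the reconcile loop can retry on.

package main

import (
	"errors"
	"fmt"
)

// Toy stand-ins for the provider types; not the real CAPO/gophercloud API.
type server struct {
	ID, Name, State string
}

var errNotFound = errors.New("server not found")

type computeAPI struct {
	servers map[string]server // keyed by server ID
}

// listByName mimics the old LIST-based lookup: when cells are non-responsive,
// Nova (with list_records_by_skipping_down_cells=true) returns a partial
// result instead of an error, modelled here as an empty slice.
func (c *computeAPI) listByName(name string, cellsDown bool) []server {
	if cellsDown {
		return nil
	}
	var out []server
	for _, s := range c.servers {
		if s.Name == name {
			out = append(out, s)
		}
	}
	return out
}

// getByID mimics the new GET-based lookup: a missing record is an explicit error.
func (c *computeAPI) getByID(id string) (*server, error) {
	s, ok := c.servers[id]
	if !ok {
		return nil, errNotFound
	}
	return &s, nil
}

func main() {
	api := &computeAPI{servers: map[string]server{
		"uuid-1": {ID: "uuid-1", Name: "capi-cluster-bastion", State: "ACTIVE"},
	}}

	// Old behaviour: the empty list looks like "no server exists", so the
	// controller would create a duplicate and delete the existing server's ports.
	fmt.Println("LIST during a cell outage:", api.listByName("capi-cluster-bastion", true))

	// New behaviour: the UUID stored in the spec either resolves or fails
	// loudly, so the controller can requeue instead of re-creating the server.
	s, err := api.getByID("uuid-1")
	if err != nil {
		fmt.Println("GET by stored UUID failed:", err)
		return
	}
	fmt.Println("GET by stored UUID:", *s)
}
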
69 changes: 44 additions & 25 deletions controllers/openstackcluster_controller.go
@@ -206,6 +206,9 @@ func contains(arr []string, target string) bool {
}

func deleteBastion(scope scope.Scope, cluster *clusterv1.Cluster, openStackCluster *infrav1.OpenStackCluster) error {
if openStackCluster.Status.Bastion == nil || openStackCluster.Status.Bastion.ID == "" {
return nil
}
computeService, err := compute.NewService(scope)
if err != nil {
return err
@@ -215,8 +218,7 @@ func deleteBastion(scope scope.Scope, cluster *clusterv1.Cluster, openStackClust
return err
}

instanceName := fmt.Sprintf("%s-bastion", cluster.Name)
instanceStatus, err := computeService.GetInstanceStatusByName(openStackCluster, instanceName)
instanceStatus, err := computeService.GetInstanceStatus(openStackCluster.Status.Bastion.ID)
if err != nil {
return err
}
@@ -277,8 +279,8 @@ func reconcileNormal(scope scope.Scope, cluster *clusterv1.Cluster, openStackClu
return reconcile.Result{}, err
}

if err = reconcileBastion(scope, cluster, openStackCluster); err != nil {
return reconcile.Result{}, err
if result, err := reconcileBastion(scope, cluster, openStackCluster); err != nil {
return result, err
}

availabilityZones, err := computeService.GetAvailabilityZones()
@@ -308,82 +310,99 @@ func reconcileNormal(scope scope.Scope, cluster *clusterv1.Cluster, openStackClu
return reconcile.Result{}, nil
}

func reconcileBastion(scope scope.Scope, cluster *clusterv1.Cluster, openStackCluster *infrav1.OpenStackCluster) error {
func reconcileBastion(scope scope.Scope, cluster *clusterv1.Cluster, openStackCluster *infrav1.OpenStackCluster) (ctrl.Result, error) {
scope.Logger().Info("Reconciling Bastion")

if openStackCluster.Spec.Bastion == nil || !openStackCluster.Spec.Bastion.Enabled {
return deleteBastion(scope, cluster, openStackCluster)
return reconcile.Result{}, deleteBastion(scope, cluster, openStackCluster)
}

computeService, err := compute.NewService(scope)
if err != nil {
return err
return reconcile.Result{}, err
}

instanceSpec := bastionToInstanceSpec(openStackCluster, cluster.Name)
bastionHash, err := compute.HashInstanceSpec(instanceSpec)
if err != nil {
return fmt.Errorf("failed computing bastion hash from instance spec: %w", err)
return reconcile.Result{}, fmt.Errorf("failed computing bastion hash from instance spec: %w", err)
}

instanceStatus, err := computeService.GetInstanceStatusByName(openStackCluster, fmt.Sprintf("%s-bastion", cluster.Name))
if err != nil {
return err
var instanceStatus *compute.InstanceStatus
if openStackCluster.Status.Bastion != nil && openStackCluster.Status.Bastion.ID != "" {
if instanceStatus, err = computeService.GetInstanceStatus(openStackCluster.Status.Bastion.ID); err != nil {
return reconcile.Result{}, err
}
}
if instanceStatus != nil {
if instanceStatus == nil {
instanceStatus, err = computeService.CreateInstance(openStackCluster, openStackCluster, instanceSpec, cluster.Name)
if err != nil {
return reconcile.Result{}, fmt.Errorf("failed to create bastion: %w", err)
}
openStackCluster.Status.Bastion = &infrav1.BastionStatus{
ID: instanceStatus.ID(),
}
} else {
if !bastionHashHasChanged(bastionHash, openStackCluster.ObjectMeta.Annotations) {
bastion, err := instanceStatus.BastionStatus(openStackCluster)
if err != nil {
return err
return reconcile.Result{}, err
}
// Add the current hash if no annotation is set.
if _, ok := openStackCluster.ObjectMeta.Annotations[BastionInstanceHashAnnotation]; !ok {
annotations.AddAnnotations(openStackCluster, map[string]string{BastionInstanceHashAnnotation: bastionHash})
}
openStackCluster.Status.Bastion = bastion
return nil
return ctrl.Result{}, nil
}

if err := deleteBastion(scope, cluster, openStackCluster); err != nil {
return err
return ctrl.Result{}, err
}
}

instanceStatus, err = computeService.CreateInstance(openStackCluster, openStackCluster, instanceSpec, cluster.Name)
if err != nil {
return fmt.Errorf("failed to reconcile bastion: %w", err)
// Make sure that bastion instance has a valid state
switch instanceStatus.State() {
case infrav1.InstanceStateError:
return ctrl.Result{}, fmt.Errorf("failed to reconcile bastion, instance state is ERROR")
case infrav1.InstanceStateBuilding:
// Requeue until bastion becomes active
return ctrl.Result{RequeueAfter: waitForBuildingInstanceToReconcile}, nil
case infrav1.InstanceStateDeleted:
// This should normally be handled by deleteBastion
openStackCluster.Status.Bastion = nil
return ctrl.Result{}, nil
}

networkingService, err := networking.NewService(scope)
if err != nil {
return err
return ctrl.Result{}, err
}
clusterName := fmt.Sprintf("%s-%s", cluster.Namespace, cluster.Name)
fp, err := networkingService.GetOrCreateFloatingIP(openStackCluster, openStackCluster, clusterName, openStackCluster.Spec.Bastion.FloatingIP)
if err != nil {
handleUpdateOSCError(openStackCluster, fmt.Errorf("failed to get or create floating IP for bastion: %w", err))
return fmt.Errorf("failed to get or create floating IP for bastion: %w", err)
return ctrl.Result{}, fmt.Errorf("failed to get or create floating IP for bastion: %w", err)
}
port, err := computeService.GetManagementPort(openStackCluster, instanceStatus)
if err != nil {
err = fmt.Errorf("getting management port for bastion: %w", err)
handleUpdateOSCError(openStackCluster, err)
return err
return ctrl.Result{}, err
}
err = networkingService.AssociateFloatingIP(openStackCluster, fp, port.ID)
if err != nil {
handleUpdateOSCError(openStackCluster, fmt.Errorf("failed to associate floating IP with bastion: %w", err))
return fmt.Errorf("failed to associate floating IP with bastion: %w", err)
return ctrl.Result{}, fmt.Errorf("failed to associate floating IP with bastion: %w", err)
}

bastion, err := instanceStatus.BastionStatus(openStackCluster)
if err != nil {
return err
return ctrl.Result{}, err
}
bastion.FloatingIP = fp.FloatingIP
openStackCluster.Status.Bastion = bastion
annotations.AddAnnotations(openStackCluster, map[string]string{BastionInstanceHashAnnotation: bastionHash})
return nil
return ctrl.Result{}, nil
}

func bastionToInstanceSpec(openStackCluster *infrav1.OpenStackCluster, clusterName string) *compute.InstanceSpec {
15 changes: 10 additions & 5 deletions controllers/openstackcluster_controller_test.go
@@ -22,7 +22,7 @@ import (

"github.com/go-logr/logr"
"github.com/golang/mock/gomock"
"github.com/gophercloud/gophercloud/openstack/compute/v2/servers"
"github.com/gophercloud/gophercloud"
"github.com/gophercloud/gophercloud/openstack/networking/v2/extensions/security/groups"
"github.com/gophercloud/gophercloud/openstack/networking/v2/networks"
"github.com/gophercloud/gophercloud/openstack/networking/v2/subnets"
@@ -38,7 +38,6 @@ import (
"sigs.k8s.io/controller-runtime/pkg/reconcile"

infrav1 "sigs.k8s.io/cluster-api-provider-openstack/api/v1alpha7"
"sigs.k8s.io/cluster-api-provider-openstack/pkg/clients"
"sigs.k8s.io/cluster-api-provider-openstack/pkg/scope"
)

@@ -195,18 +194,24 @@ var _ = Describe("OpenStackCluster controller", func() {
Expect(err).To(BeNil())
err = k8sClient.Create(ctx, capiCluster)
Expect(err).To(BeNil())
testCluster.Status = infrav1.OpenStackClusterStatus{
Bastion: &infrav1.BastionStatus{
ID: "bastion-uuid",
},
}
err = k8sClient.Status().Update(ctx, testCluster)
Expect(err).To(BeNil())
scope, err := mockScopeFactory.NewClientScopeFromCluster(ctx, k8sClient, testCluster, nil, logr.Discard())
Expect(err).To(BeNil())

computeClientRecorder := mockScopeFactory.ComputeClient.EXPECT()
computeClientRecorder.ListServers(servers.ListOpts{
Name: "^capi-cluster-bastion$",
}).Return([]clients.ServerExt{}, nil)
computeClientRecorder.GetServer("bastion-uuid").Return(nil, gophercloud.ErrResourceNotFound{})

networkClientRecorder := mockScopeFactory.NetworkClient.EXPECT()
networkClientRecorder.ListSecGroup(gomock.Any()).Return([]groups.SecGroup{}, nil)

err = deleteBastion(scope, capiCluster, testCluster)
Expect(testCluster.Status.Bastion).To(BeNil())
Expect(err).To(BeNil())
})
It("should implicitly filter cluster subnets by cluster network", func() {
23 changes: 17 additions & 6 deletions controllers/openstackmachine_controller.go
@@ -67,6 +67,7 @@ type OpenStackMachineReconciler struct {
const (
waitForClusterInfrastructureReadyDuration = 15 * time.Second
waitForInstanceBecomeActiveToReconcile = 60 * time.Second
waitForBuildingInstanceToReconcile = 10 * time.Second
)

// +kubebuilder:rbac:groups=infrastructure.cluster.x-k8s.io,resources=openstackmachines,verbs=get;list;watch;create;update;patch;delete
@@ -246,7 +247,10 @@ func (r *OpenStackMachineReconciler) reconcileDelete(scope scope.Scope, cluster
}
}

instanceStatus, err := computeService.GetInstanceStatusByName(openStackMachine, openStackMachine.Name)
var instanceStatus *compute.InstanceStatus
if openStackMachine.Spec.InstanceID != nil {
instanceStatus, err = computeService.GetInstanceStatusByName(openStackMachine, openStackMachine.Name)
}
if err != nil {
return ctrl.Result{}, err
}
@@ -379,6 +383,9 @@ func (r *OpenStackMachineReconciler) reconcileNormal(ctx context.Context, scope
scope.Logger().Info("Machine instance state is DELETED, no actions")
conditions.MarkFalse(openStackMachine, infrav1.InstanceReadyCondition, infrav1.InstanceDeletedReason, clusterv1.ConditionSeverityError, "")
return ctrl.Result{}, nil
case infrav1.InstanceStateBuilding:
scope.Logger().Info("Waiting for instance to become ACTIVE", "id", instanceStatus.ID(), "status", instanceStatus.State())
return ctrl.Result{RequeueAfter: waitForBuildingInstanceToReconcile}, nil
default:
// The other state is normal (for example, migrating, shutoff) but we don't want to proceed until it's ACTIVE
// due to potential conflict or unexpected actions
@@ -431,11 +438,15 @@ func (r *OpenStackMachineReconciler) reconcileNormal(ctx context.Context, scope
}

func (r *OpenStackMachineReconciler) getOrCreate(logger logr.Logger, cluster *clusterv1.Cluster, openStackCluster *infrav1.OpenStackCluster, machine *clusterv1.Machine, openStackMachine *infrav1.OpenStackMachine, computeService *compute.Service, userData string) (*compute.InstanceStatus, error) {
instanceStatus, err := computeService.GetInstanceStatusByName(openStackMachine, openStackMachine.Name)
if err != nil {
logger.Info("Unable to get OpenStack instance", "name", openStackMachine.Name)
conditions.MarkFalse(openStackMachine, infrav1.InstanceReadyCondition, infrav1.OpenStackErrorReason, clusterv1.ConditionSeverityError, err.Error())
return nil, err
var instanceStatus *compute.InstanceStatus
var err error
if openStackMachine.Spec.InstanceID != nil {
instanceStatus, err = computeService.GetInstanceStatus(*openStackMachine.Spec.InstanceID)
if err != nil {
logger.Info("Unable to get OpenStack instance", "name", openStackMachine.Name)
conditions.MarkFalse(openStackMachine, infrav1.InstanceReadyCondition, infrav1.OpenStackErrorReason, clusterv1.ConditionSeverityError, err.Error())
return nil, err
}
}

if instanceStatus == nil {
9 changes: 7 additions & 2 deletions controllers/suite_test.go
@@ -32,6 +32,7 @@ import (
"k8s.io/apimachinery/pkg/types"
"k8s.io/client-go/kubernetes/scheme"
"k8s.io/client-go/rest"
"k8s.io/utils/pointer"
clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
"sigs.k8s.io/cluster-api/test/framework"
"sigs.k8s.io/controller-runtime/pkg/client"
@@ -148,9 +149,13 @@ var _ = Describe("When calling getOrCreate", func() {
cluster := &clusterv1.Cluster{}
openStackCluster := &infrav1.OpenStackCluster{}
machine := &clusterv1.Machine{}
openStackMachine := &infrav1.OpenStackMachine{}
openStackMachine := &infrav1.OpenStackMachine{
Spec: infrav1.OpenStackMachineSpec{
InstanceID: pointer.String("machine-uuid"),
},
}

mockScopeFactory.ComputeClient.EXPECT().ListServers(gomock.Any()).Return(nil, errors.New("Test error when listing servers"))
mockScopeFactory.ComputeClient.EXPECT().GetServer(gomock.Any()).Return(nil, errors.New("Test error when getting server"))
instanceStatus, err := reconsiler.getOrCreate(logger, cluster, openStackCluster, machine, openStackMachine, computeService, "")
Expect(err).To(HaveOccurred())
Expect(instanceStatus).To(BeNil())
23 changes: 2 additions & 21 deletions pkg/cloud/services/compute/instance.go
@@ -333,27 +333,8 @@ func (s *Service) createInstanceImpl(eventObject runtime.Object, openStackCluste
return nil, fmt.Errorf("error creating Openstack instance: %v", err)
}

var createdInstance *InstanceStatus
err = wait.PollUntilContextTimeout(context.TODO(), retryInterval, instanceCreateTimeout, true, func(_ context.Context) (bool, error) {
createdInstance, err = s.GetInstanceStatus(server.ID)
if err != nil {
if capoerrors.IsRetryable(err) {
return false, nil
}
return false, err
}
if createdInstance.State() == infrav1.InstanceStateError {
return false, fmt.Errorf("error creating OpenStack instance %s, status changed to error", createdInstance.ID())
}
return createdInstance.State() == infrav1.InstanceStateActive, nil
})
if err != nil {
record.Warnf(eventObject, "FailedCreateServer", "Failed to create server %s: %v", createdInstance.Name(), err)
return nil, err
}

record.Eventf(eventObject, "SuccessfulCreateServer", "Created server %s with id %s", createdInstance.Name(), createdInstance.ID())
return createdInstance, nil
record.Eventf(eventObject, "SuccessfulCreateServer", "Created server %s with id %s", server.Name, server.ID)
return &InstanceStatus{server, s.scope.Logger()}, nil
}

func volumeName(instanceName string, nameSuffix string) string {