From 238a8df6fbb26b9d1e8cd2bd05103160655acbca Mon Sep 17 00:00:00 2001 From: Brad Ison Date: Wed, 23 Oct 2019 16:24:25 +0200 Subject: [PATCH] Search for existing instances by instance ID This changes the behavior of the actuator's `getMachineInstances` method to first look for a machine's backing instance by the instance ID in the provider status. If the instance ID is not set, it falls back to the previous method of filtering by tags. This should help prevent issues where AWS creates the instance, but takes some time to propagate tag values, which can result in Exists() wrongly returning false. --- pkg/actuators/machine/actuator.go | 18 ++++ pkg/actuators/machine/actuator_test.go | 120 +++++++++++++++++++++++++ pkg/actuators/machine/utils.go | 64 +++++++++++-- 3 files changed, 195 insertions(+), 7 deletions(-) diff --git a/pkg/actuators/machine/actuator.go b/pkg/actuators/machine/actuator.go index e94e925c1e..a958895d8d 100644 --- a/pkg/actuators/machine/actuator.go +++ b/pkg/actuators/machine/actuator.go @@ -538,6 +538,24 @@ func (a *Actuator) getMachineInstances(cluster *clusterv1.Cluster, machine *mach return nil, err } + status := &providerconfigv1.AWSMachineProviderStatus{} + err = a.codec.DecodeProviderStatus(machine.Status.ProviderStatus, status) + + // If the status was decoded successfully, and there is a non-empty instance + // ID, search using that, otherwise fallback to filtering based on tags. + if err == nil && status.InstanceID != nil && *status.InstanceID != "" { + i, err := getExistingInstanceByID(*status.InstanceID, client) + + if err != nil { + glog.Warningf("%s: Failed to find running instance by id %s: %v", + machine.Name, *status.InstanceID, err) + } else { + glog.Infof("%s: Found instance by id: %s", machine.Name, *status.InstanceID) + + return []*ec2.Instance{i}, nil + } + } + return getExistingInstances(machine, client) } diff --git a/pkg/actuators/machine/actuator_test.go b/pkg/actuators/machine/actuator_test.go index 08401517ef..0b83023600 100644 --- a/pkg/actuators/machine/actuator_test.go +++ b/pkg/actuators/machine/actuator_test.go @@ -1135,3 +1135,123 @@ func TestCreate(t *testing.T) { } } } + +func TestGetMachineInstances(t *testing.T) { + clusterID := "aws-actuator-cluster" + instanceID := "i-02fa4197109214b46" + imageID := "ami-a9acbbd6" + + machine, err := stubMachine() + if err != nil { + t.Fatalf("unable to build stub machine: %v", err) + } + + codec, err := providerconfigv1.NewCodec() + if err != nil { + t.Fatalf("unable to build codec: %v", err) + } + + testCases := []struct { + testcase string + providerStatus providerconfigv1.AWSMachineProviderStatus + awsClientFunc func(*gomock.Controller) awsclient.Client + }{ + { + testcase: "empty-status-search-by-tag", + providerStatus: providerconfigv1.AWSMachineProviderStatus{}, + awsClientFunc: func(ctrl *gomock.Controller) awsclient.Client { + mockAWSClient := mockaws.NewMockClient(ctrl) + + request := &ec2.DescribeInstancesInput{ + Filters: []*ec2.Filter{ + { + Name: awsTagFilter("Name"), + Values: aws.StringSlice([]string{machine.Name}), + }, + + clusterFilter(clusterID), + + { + Name: aws.String("instance-state-name"), + Values: existingInstanceStates(), + }, + }, + } + + mockAWSClient.EXPECT().DescribeInstances(request).Return( + stubDescribeInstancesOutput(imageID, instanceID), + nil, + ).Times(1) + + return mockAWSClient + }, + }, + { + testcase: "has-status-search-by-id", + providerStatus: providerconfigv1.AWSMachineProviderStatus{ + InstanceID: aws.String(instanceID), + }, + awsClientFunc: func(ctrl *gomock.Controller) awsclient.Client { + mockAWSClient := mockaws.NewMockClient(ctrl) + + request := &ec2.DescribeInstancesInput{ + Filters: []*ec2.Filter{ + { + Name: aws.String("instance-id"), + Values: aws.StringSlice([]string{instanceID}), + }, + { + Name: aws.String("instance-state-name"), + Values: existingInstanceStates(), + }, + }, + } + + mockAWSClient.EXPECT().DescribeInstances(request).Return( + stubDescribeInstancesOutput(imageID, instanceID), + nil, + ).Times(1) + + return mockAWSClient + }, + }, + } + + for _, tc := range testCases { + t.Run(tc.testcase, func(t *testing.T) { + ctrl := gomock.NewController(t) + defer ctrl.Finish() + + awsStatusRaw, err := codec.EncodeProviderStatus(&tc.providerStatus) + if err != nil { + t.Errorf("Error encoding ProviderStatus: %v", err) + } + + machineCopy := machine.DeepCopy() + machineCopy.Status.ProviderStatus = awsStatusRaw + + awsClient := tc.awsClientFunc(ctrl) + + params := ActuatorParams{ + Codec: codec, + AwsClientBuilder: awsClientBuilderFunc(awsClient), + } + + actuator, err := NewActuator(params) + if err != nil { + t.Errorf("Error creating Actuator: %v", err) + } + + _, err = actuator.getMachineInstances(nil, machineCopy) + if err != nil { + t.Errorf("Unexpected error from getMachineInstances: %v", err) + } + }) + } +} + +func awsClientBuilderFunc(c awsclient.Client) awsclient.AwsClientBuilderFuncType { + return func(_ client.Client, _, _, _ string) (awsclient.Client, error) { + return c, nil + } +} diff --git a/pkg/actuators/machine/utils.go b/pkg/actuators/machine/utils.go index 98bebcc2e1..7cb59769bf 100644 --- a/pkg/actuators/machine/utils.go +++ b/pkg/actuators/machine/utils.go @@ -33,6 +33,18 @@ import ( awsclient "sigs.k8s.io/cluster-api-provider-aws/pkg/client" ) +// existingInstanceStates returns the list of states an EC2 instance can be in +// while being considered "existing", i.e. mostly anything but "Terminated". +func existingInstanceStates() []*string { + return []*string{ + aws.String(ec2.InstanceStateNameRunning), + aws.String(ec2.InstanceStateNamePending), + aws.String(ec2.InstanceStateNameStopped), + aws.String(ec2.InstanceStateNameStopping), + aws.String(ec2.InstanceStateNameShuttingDown), + } +} + // getRunningInstance returns the AWS instance for a given machine. If multiple instances match our machine, // the most recently launched will be returned. If no instance exists, an error will be returned. func getRunningInstance(machine *machinev1.Machine, client awsclient.Client) (*ec2.Instance, error) { @@ -74,13 +86,51 @@ func getStoppedInstances(machine *machinev1.Machine, client awsclient.Client) ([ } func getExistingInstances(machine *machinev1.Machine, client awsclient.Client) ([]*ec2.Instance, error) { - return getInstances(machine, client, []*string{ - aws.String(ec2.InstanceStateNameRunning), - aws.String(ec2.InstanceStateNamePending), - aws.String(ec2.InstanceStateNameStopped), - aws.String(ec2.InstanceStateNameStopping), - aws.String(ec2.InstanceStateNameShuttingDown), - }) + return getInstances(machine, client, existingInstanceStates()) +} + +func getExistingInstanceByID(id string, client awsclient.Client) (*ec2.Instance, error) { + return getInstanceByID(id, client, existingInstanceStates()) +} + +// getInstanceByID returns the instance with the given ID if it exists. +func getInstanceByID(id string, client awsclient.Client, instanceStateFilter []*string) (*ec2.Instance, error) { + if id == "" { + return nil, fmt.Errorf("instance-id not specified") + } + + requestFilters := []*ec2.Filter{ + { + Name: aws.String("instance-id"), + Values: aws.StringSlice([]string{id}), + }, + } + + if instanceStateFilter != nil { + requestFilters = append(requestFilters, &ec2.Filter{ + Name: aws.String("instance-state-name"), + Values: instanceStateFilter, + }) + } + + request := &ec2.DescribeInstancesInput{Filters: requestFilters} + + result, err := client.DescribeInstances(request) + if err != nil { + return nil, err + } + + if len(result.Reservations) != 1 { + return nil, fmt.Errorf("found %d reservations for instance-id %s", len(result.Reservations), id) + } + + reservation := result.Reservations[0] + + if len(reservation.Instances) != 1 { + return nil, fmt.Errorf("found %d instances for instance-id %s", len(reservation.Instances), id) + } + + return reservation.Instances[0], nil } // getInstances returns all instances that have a tag matching our machine name,