From e0e99a9bd40b083591105be515bb24eae2ee6ec0 Mon Sep 17 00:00:00 2001 From: Claes Mogren Date: Fri, 5 Jun 2020 11:32:34 -0700 Subject: [PATCH] Handle stale instance metadata --- pkg/awsutils/awsutils.go | 67 ++++++++++++++++++++++++++++------- pkg/awsutils/awsutils_test.go | 27 ++++++++------ 2 files changed, 71 insertions(+), 23 deletions(-) diff --git a/pkg/awsutils/awsutils.go b/pkg/awsutils/awsutils.go index 6c02a004516..a89a0a885ad 100644 --- a/pkg/awsutils/awsutils.go +++ b/pkg/awsutils/awsutils.go @@ -19,6 +19,7 @@ import ( "fmt" "math/rand" "os" + "regexp" "strconv" "strings" "time" @@ -55,7 +56,7 @@ const ( metadataInterface = "/interface-id/" metadataSubnetCIDR = "/subnet-ipv4-cidr-block" metadataIPv4s = "/local-ipv4s" - maxENIDeleteRetries = 12 + maxENIEC2APIRetries = 12 maxENIBackoffDelay = time.Minute eniDescriptionPrefix = "aws-K8S-" metadataOwnerID = "/owner-id" @@ -820,7 +821,7 @@ func (cache *EC2InstanceMetadataCache) freeENI(eniName string, sleepDelayAfterDe } // Retry detaching the ENI from the instance - err = retry.RetryNWithBackoff(retry.NewSimpleBackoff(time.Millisecond*200, maxBackoffDelay, 0.15, 2.0), maxENIDeleteRetries, func() error { + err = retry.RetryNWithBackoff(retry.NewSimpleBackoff(time.Millisecond*200, maxBackoffDelay, 0.15, 2.0), maxENIEC2APIRetries, func() error { start := time.Now() _, ec2Err := cache.ec2SVC.DetachNetworkInterfaceWithContext(context.Background(), detachInput, userAgent) awsAPILatency.WithLabelValues("DetachNetworkInterface", fmt.Sprint(ec2Err != nil)).Observe(msSince(start)) @@ -890,7 +891,7 @@ func (cache *EC2InstanceMetadataCache) deleteENI(eniName string, maxBackoffDelay deleteInput := &ec2.DeleteNetworkInterfaceInput{ NetworkInterfaceId: aws.String(eniName), } - err := retry.RetryNWithBackoff(retry.NewSimpleBackoff(time.Millisecond*500, maxBackoffDelay, 0.15, 2.0), maxENIDeleteRetries, func() error { + err := retry.RetryNWithBackoff(retry.NewSimpleBackoff(time.Millisecond*500, maxBackoffDelay, 0.15, 2.0), maxENIEC2APIRetries, func() error { start := time.Now() _, ec2Err := cache.ec2SVC.DeleteNetworkInterfaceWithContext(context.Background(), deleteInput, userAgent) awsAPILatency.WithLabelValues("DeleteNetworkInterface", fmt.Sprint(ec2Err != nil)).Observe(msSince(start)) @@ -955,20 +956,49 @@ func (cache *EC2InstanceMetadataCache) DescribeAllENIs() ([]ENIMetadata, map[str eniIDs = append(eniIDs, eni.ENIID) eniMap[eni.ENIID] = eni } - input := &ec2.DescribeNetworkInterfacesInput{NetworkInterfaceIds: aws.StringSlice(eniIDs)} - start := time.Now() - ec2Response, err := cache.ec2SVC.DescribeNetworkInterfacesWithContext(context.Background(), input, userAgent) - awsAPILatency.WithLabelValues("DescribeNetworkInterfaces", fmt.Sprint(err != nil)).Observe(msSince(start)) - if err != nil { + var ec2Response *ec2.DescribeNetworkInterfacesOutput + // Try calling EC2 to describe the interfaces. + for retryCount := 0; retryCount < maxENIEC2APIRetries && len(eniIDs) > 0; retryCount++ { + input := &ec2.DescribeNetworkInterfacesInput{NetworkInterfaceIds: aws.StringSlice(eniIDs)} + start := time.Now() + ec2Response, err = cache.ec2SVC.DescribeNetworkInterfacesWithContext(context.Background(), input, userAgent) + awsAPILatency.WithLabelValues("DescribeNetworkInterfaces", fmt.Sprint(err != nil)).Observe(msSince(start)) + if err == nil { + // No error, exit the loop + break + } + awsAPIErrInc("DescribeNetworkInterfaces", err) + log.Errorf("Failed to call ec2:DescribeNetworkInterfaces for %v: %v", aws.StringValueSlice(input.NetworkInterfaceIds), err) if aerr, ok := err.(awserr.Error); ok { if aerr.Code() == "InvalidNetworkInterfaceID.NotFound" { - return nil, nil, ErrENINotFound + badENIID := badENIID(aerr.Message()) + log.Debugf("Could not find interface: %s, ID: %s", aerr.Message(), badENIID) + // Remove this ENI from the map + delete(eniMap, badENIID) + // Remove the failing ENI ID from the EC2 API request and try again + var tmpENIIDs []string + for _, eniID := range eniIDs { + if eniID != badENIID { + tmpENIIDs = append(tmpENIIDs, eniID) + } + } + eniIDs = tmpENIIDs + continue } } - awsAPIErrInc("DescribeNetworkInterfaces", err) - log.Errorf("Failed to call ec2:DescribeNetworkInterfaces for %v: %v", eniIDs, err) - return nil, nil, errors.Wrap(err, "failed to describe network interfaces") + // For other errors sleep a short while before the next retry + time.Sleep(time.Duration(retryCount*10) * time.Millisecond) + } + + if err != nil { + return nil, nil, err + } + + // Collect the verified ENIs + var verifiedENIs []ENIMetadata + for _, eniMetadata := range eniMap { + verifiedENIs = append(verifiedENIs, eniMetadata) } // Collect ENI response into ENI metadata and tags. @@ -990,7 +1020,18 @@ func (cache *EC2InstanceMetadataCache) DescribeAllENIs() ([]ENIMetadata, map[str tagMap[eniMetadata.ENIID] = tags } } - return allENIs, tagMap, nil + return verifiedENIs, tagMap, nil +} + +var eniErrorMessageRegex = regexp.MustCompile("'([a-zA-Z0-9-]+)'") + +func badENIID(errMsg string) string { + found := eniErrorMessageRegex.FindStringSubmatch(errMsg) + if found == nil || len(found) < 2 { + return "" + } + fmt.Printf("found=%v\n", found) + return found[1] } // logOutOfSyncState compares the IP and metadata returned by IMDS and the EC2 API DescribeNetworkInterfaces calls diff --git a/pkg/awsutils/awsutils_test.go b/pkg/awsutils/awsutils_test.go index de0e51e9485..a294e0f08fa 100644 --- a/pkg/awsutils/awsutils_test.go +++ b/pkg/awsutils/awsutils_test.go @@ -371,12 +371,6 @@ func TestDescribeAllENIs(t *testing.T) { ctrl, mockMetadata, mockEC2 := setup(t) defer ctrl.Finish() - mockMetadata.EXPECT().GetMetadata(metadataMACPath).Times(2).Return(primaryMAC, nil) - mockMetadata.EXPECT().GetMetadata(metadataMACPath+primaryMAC+metadataDeviceNum).Times(2).Return(eni1Device, nil) - mockMetadata.EXPECT().GetMetadata(metadataMACPath+primaryMAC+metadataInterface).Times(2).Return(eniID, nil) - mockMetadata.EXPECT().GetMetadata(metadataMACPath+primaryMAC+metadataSubnetCIDR).Times(2).Return(subnetCIDR, nil) - mockMetadata.EXPECT().GetMetadata(metadataMACPath+primaryMAC+metadataIPv4s).Times(2).Return("", nil) - result := &ec2.DescribeNetworkInterfacesOutput{ NetworkInterfaces: []*ec2.NetworkInterface{{ TagSet: []*ec2.Tag{ @@ -385,18 +379,31 @@ func TestDescribeAllENIs(t *testing.T) { }}, } + expectedError := awserr.New("InvalidNetworkInterfaceID.NotFound", "no 'eni-xxx'", nil) + noMessageError := awserr.New("InvalidNetworkInterfaceID.NotFound", "no message", nil) + err := errors.New("other Error") + testCases := []struct { name string exptags map[string]TagMap + n int awsErr error expErr error }{ - {"success DescribeENI", map[string]TagMap{"": {"foo": "foo-value"}}, nil, nil}, - {"not found error", nil, awserr.New("InvalidNetworkInterfaceID.NotFound", "", nil), ErrENINotFound}, + {"Success DescribeENI", map[string]TagMap{"": {"foo": "foo-value"}}, 1, nil, nil}, + {"Not found error", nil, maxENIEC2APIRetries, awserr.New("InvalidNetworkInterfaceID.NotFound", "no 'eni-xxx'", nil), expectedError}, + {"Not found, no message", nil, maxENIEC2APIRetries, awserr.New("InvalidNetworkInterfaceID.NotFound", "no message", nil), noMessageError}, + {"Other error", nil, maxENIEC2APIRetries, err, err}, } + mockMetadata.EXPECT().GetMetadata(metadataMACPath).Times(len(testCases)).Return(primaryMAC, nil) + mockMetadata.EXPECT().GetMetadata(metadataMACPath+primaryMAC+metadataDeviceNum).Times(len(testCases)).Return(eni1Device, nil) + mockMetadata.EXPECT().GetMetadata(metadataMACPath+primaryMAC+metadataInterface).Times(len(testCases)).Return(eniID, nil) + mockMetadata.EXPECT().GetMetadata(metadataMACPath+primaryMAC+metadataSubnetCIDR).Times(len(testCases)).Return(subnetCIDR, nil) + mockMetadata.EXPECT().GetMetadata(metadataMACPath+primaryMAC+metadataIPv4s).Times(len(testCases)).Return("", nil) + for _, tc := range testCases { - mockEC2.EXPECT().DescribeNetworkInterfacesWithContext(gomock.Any(), gomock.Any(), gomock.Any()).Return(result, tc.awsErr) + mockEC2.EXPECT().DescribeNetworkInterfacesWithContext(gomock.Any(), gomock.Any(), gomock.Any()).Times(tc.n).Return(result, tc.awsErr) ins := &EC2InstanceMetadataCache{ec2Metadata: mockMetadata, ec2SVC: mockEC2} _, tags, err := ins.DescribeAllENIs() assert.Equal(t, tc.expErr, err, tc.name) @@ -632,7 +639,7 @@ func TestFreeENIRetryMax(t *testing.T) { mockEC2.EXPECT().DescribeNetworkInterfacesWithContext(gomock.Any(), gomock.Any(), gomock.Any()).Return(result, nil) mockEC2.EXPECT().DetachNetworkInterfaceWithContext(gomock.Any(), gomock.Any(), gomock.Any()).Return(nil, nil) - for i := 0; i < maxENIDeleteRetries; i++ { + for i := 0; i < maxENIEC2APIRetries; i++ { mockEC2.EXPECT().DeleteNetworkInterfaceWithContext(gomock.Any(), gomock.Any(), gomock.Any()).Return(nil, errors.New("testing retrying delete")) }