Skip to content

Commit

Permalink
Handle stale instance metadata
Browse files Browse the repository at this point in the history
  • Loading branch information
Claes Mogren authored and mogren committed Jun 8, 2020
1 parent 2e47284 commit 3aea95d
Show file tree
Hide file tree
Showing 2 changed files with 71 additions and 23 deletions.
67 changes: 54 additions & 13 deletions pkg/awsutils/awsutils.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ import (
"fmt"
"math/rand"
"os"
"regexp"
"strconv"
"strings"
"time"
Expand Down Expand Up @@ -55,7 +56,7 @@ const (
metadataInterface = "/interface-id/"
metadataSubnetCIDR = "/subnet-ipv4-cidr-block"
metadataIPv4s = "/local-ipv4s"
maxENIDeleteRetries = 12
maxENIEC2APIRetries = 12
maxENIBackoffDelay = time.Minute
eniDescriptionPrefix = "aws-K8S-"
metadataOwnerID = "/owner-id"
Expand Down Expand Up @@ -820,7 +821,7 @@ func (cache *EC2InstanceMetadataCache) freeENI(eniName string, sleepDelayAfterDe
}

// Retry detaching the ENI from the instance
err = retry.RetryNWithBackoff(retry.NewSimpleBackoff(time.Millisecond*200, maxBackoffDelay, 0.15, 2.0), maxENIDeleteRetries, func() error {
err = retry.RetryNWithBackoff(retry.NewSimpleBackoff(time.Millisecond*200, maxBackoffDelay, 0.15, 2.0), maxENIEC2APIRetries, func() error {
start := time.Now()
_, ec2Err := cache.ec2SVC.DetachNetworkInterfaceWithContext(context.Background(), detachInput, userAgent)
awsAPILatency.WithLabelValues("DetachNetworkInterface", fmt.Sprint(ec2Err != nil)).Observe(msSince(start))
Expand Down Expand Up @@ -890,7 +891,7 @@ func (cache *EC2InstanceMetadataCache) deleteENI(eniName string, maxBackoffDelay
deleteInput := &ec2.DeleteNetworkInterfaceInput{
NetworkInterfaceId: aws.String(eniName),
}
err := retry.RetryNWithBackoff(retry.NewSimpleBackoff(time.Millisecond*500, maxBackoffDelay, 0.15, 2.0), maxENIDeleteRetries, func() error {
err := retry.RetryNWithBackoff(retry.NewSimpleBackoff(time.Millisecond*500, maxBackoffDelay, 0.15, 2.0), maxENIEC2APIRetries, func() error {
start := time.Now()
_, ec2Err := cache.ec2SVC.DeleteNetworkInterfaceWithContext(context.Background(), deleteInput, userAgent)
awsAPILatency.WithLabelValues("DeleteNetworkInterface", fmt.Sprint(ec2Err != nil)).Observe(msSince(start))
Expand Down Expand Up @@ -955,20 +956,49 @@ func (cache *EC2InstanceMetadataCache) DescribeAllENIs() ([]ENIMetadata, map[str
eniIDs = append(eniIDs, eni.ENIID)
eniMap[eni.ENIID] = eni
}
input := &ec2.DescribeNetworkInterfacesInput{NetworkInterfaceIds: aws.StringSlice(eniIDs)}

start := time.Now()
ec2Response, err := cache.ec2SVC.DescribeNetworkInterfacesWithContext(context.Background(), input, userAgent)
awsAPILatency.WithLabelValues("DescribeNetworkInterfaces", fmt.Sprint(err != nil)).Observe(msSince(start))
if err != nil {
var ec2Response *ec2.DescribeNetworkInterfacesOutput
// Try calling EC2 to describe the interfaces.
for retryCount := 0; retryCount < maxENIEC2APIRetries && len(eniIDs) > 0; retryCount++ {
input := &ec2.DescribeNetworkInterfacesInput{NetworkInterfaceIds: aws.StringSlice(eniIDs)}
start := time.Now()
ec2Response, err = cache.ec2SVC.DescribeNetworkInterfacesWithContext(context.Background(), input, userAgent)
awsAPILatency.WithLabelValues("DescribeNetworkInterfaces", fmt.Sprint(err != nil)).Observe(msSince(start))
if err == nil {
// No error, exit the loop
break
}
awsAPIErrInc("DescribeNetworkInterfaces", err)
log.Errorf("Failed to call ec2:DescribeNetworkInterfaces for %v: %v", aws.StringValueSlice(input.NetworkInterfaceIds), err)
if aerr, ok := err.(awserr.Error); ok {
if aerr.Code() == "InvalidNetworkInterfaceID.NotFound" {
return nil, nil, ErrENINotFound
badENIID := badENIID(aerr.Message())
log.Debugf("Could not find interface: %s, ID: %s", aerr.Message(), badENIID)
// Remove this ENI from the map
delete(eniMap, badENIID)
// Remove the failing ENI ID from the EC2 API request and try again
var tmpENIIDs []string
for _, eniID := range eniIDs {
if eniID != badENIID {
tmpENIIDs = append(tmpENIIDs, eniID)
}
}
eniIDs = tmpENIIDs
continue
}
}
awsAPIErrInc("DescribeNetworkInterfaces", err)
log.Errorf("Failed to call ec2:DescribeNetworkInterfaces for %v: %v", eniIDs, err)
return nil, nil, errors.Wrap(err, "failed to describe network interfaces")
// For other errors sleep a short while before the next retry
time.Sleep(time.Duration(retryCount*10) * time.Millisecond)
}

if err != nil {
return nil, nil, err
}

// Collect the verified ENIs
var verifiedENIs []ENIMetadata
for _, eniMetadata := range eniMap {
verifiedENIs = append(verifiedENIs, eniMetadata)
}

// Collect ENI response into ENI metadata and tags.
Expand All @@ -990,7 +1020,18 @@ func (cache *EC2InstanceMetadataCache) DescribeAllENIs() ([]ENIMetadata, map[str
tagMap[eniMetadata.ENIID] = tags
}
}
return allENIs, tagMap, nil
return verifiedENIs, tagMap, nil
}

var eniErrorMessageRegex = regexp.MustCompile("'([a-zA-Z0-9-]+)'")

func badENIID(errMsg string) string {
found := eniErrorMessageRegex.FindStringSubmatch(errMsg)
if found == nil || len(found) < 2 {
return ""
}
fmt.Printf("found=%v\n", found)
return found[1]
}

// logOutOfSyncState compares the IP and metadata returned by IMDS and the EC2 API DescribeNetworkInterfaces calls
Expand Down
27 changes: 17 additions & 10 deletions pkg/awsutils/awsutils_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -371,12 +371,6 @@ func TestDescribeAllENIs(t *testing.T) {
ctrl, mockMetadata, mockEC2 := setup(t)
defer ctrl.Finish()

mockMetadata.EXPECT().GetMetadata(metadataMACPath).Times(2).Return(primaryMAC, nil)
mockMetadata.EXPECT().GetMetadata(metadataMACPath+primaryMAC+metadataDeviceNum).Times(2).Return(eni1Device, nil)
mockMetadata.EXPECT().GetMetadata(metadataMACPath+primaryMAC+metadataInterface).Times(2).Return(eniID, nil)
mockMetadata.EXPECT().GetMetadata(metadataMACPath+primaryMAC+metadataSubnetCIDR).Times(2).Return(subnetCIDR, nil)
mockMetadata.EXPECT().GetMetadata(metadataMACPath+primaryMAC+metadataIPv4s).Times(2).Return("", nil)

result := &ec2.DescribeNetworkInterfacesOutput{
NetworkInterfaces: []*ec2.NetworkInterface{{
TagSet: []*ec2.Tag{
Expand All @@ -385,18 +379,31 @@ func TestDescribeAllENIs(t *testing.T) {
}},
}

expectedError := awserr.New("InvalidNetworkInterfaceID.NotFound", "no 'eni-xxx'", nil)
noMessageError := awserr.New("InvalidNetworkInterfaceID.NotFound", "no message", nil)
err := errors.New("other Error")

testCases := []struct {
name string
exptags map[string]TagMap
n int
awsErr error
expErr error
}{
{"success DescribeENI", map[string]TagMap{"": {"foo": "foo-value"}}, nil, nil},
{"not found error", nil, awserr.New("InvalidNetworkInterfaceID.NotFound", "", nil), ErrENINotFound},
{"Success DescribeENI", map[string]TagMap{"": {"foo": "foo-value"}}, 1, nil, nil},
{"Not found error", nil, maxENIEC2APIRetries, awserr.New("InvalidNetworkInterfaceID.NotFound", "no 'eni-xxx'", nil), expectedError},
{"Not found, no message", nil, maxENIEC2APIRetries, awserr.New("InvalidNetworkInterfaceID.NotFound", "no message", nil), noMessageError},
{"Other error", nil, maxENIEC2APIRetries, err, err},
}

mockMetadata.EXPECT().GetMetadata(metadataMACPath).Times(len(testCases)).Return(primaryMAC, nil)
mockMetadata.EXPECT().GetMetadata(metadataMACPath+primaryMAC+metadataDeviceNum).Times(len(testCases)).Return(eni1Device, nil)
mockMetadata.EXPECT().GetMetadata(metadataMACPath+primaryMAC+metadataInterface).Times(len(testCases)).Return(eniID, nil)
mockMetadata.EXPECT().GetMetadata(metadataMACPath+primaryMAC+metadataSubnetCIDR).Times(len(testCases)).Return(subnetCIDR, nil)
mockMetadata.EXPECT().GetMetadata(metadataMACPath+primaryMAC+metadataIPv4s).Times(len(testCases)).Return("", nil)

for _, tc := range testCases {
mockEC2.EXPECT().DescribeNetworkInterfacesWithContext(gomock.Any(), gomock.Any(), gomock.Any()).Return(result, tc.awsErr)
mockEC2.EXPECT().DescribeNetworkInterfacesWithContext(gomock.Any(), gomock.Any(), gomock.Any()).Times(tc.n).Return(result, tc.awsErr)
ins := &EC2InstanceMetadataCache{ec2Metadata: mockMetadata, ec2SVC: mockEC2}
_, tags, err := ins.DescribeAllENIs()
assert.Equal(t, tc.expErr, err, tc.name)
Expand Down Expand Up @@ -632,7 +639,7 @@ func TestFreeENIRetryMax(t *testing.T) {
mockEC2.EXPECT().DescribeNetworkInterfacesWithContext(gomock.Any(), gomock.Any(), gomock.Any()).Return(result, nil)
mockEC2.EXPECT().DetachNetworkInterfaceWithContext(gomock.Any(), gomock.Any(), gomock.Any()).Return(nil, nil)

for i := 0; i < maxENIDeleteRetries; i++ {
for i := 0; i < maxENIEC2APIRetries; i++ {
mockEC2.EXPECT().DeleteNetworkInterfaceWithContext(gomock.Any(), gomock.Any(), gomock.Any()).Return(nil, errors.New("testing retrying delete"))
}

Expand Down

0 comments on commit 3aea95d

Please sign in to comment.