From 32bc3854439ab3674bab3c4146494232bb997779 Mon Sep 17 00:00:00 2001 From: Yoriyasu Yano <430092+yorinasub17@users.noreply.github.com> Date: Thu, 10 Mar 2022 17:16:06 -0600 Subject: [PATCH] Update cloudwatch nuking to support filtering and concurrent deletes --- .circleci/nuke_config.yml | 45 ++++++++++++ README.md | 48 +++++++------ aws/aws.go | 2 +- aws/cloudwatch_loggroup.go | 118 ++++++++++++++++++++++++++----- aws/cloudwatch_loggroup_test.go | 36 ++++++++-- aws/cloudwatch_loggroup_types.go | 6 +- config/config.go | 1 + config/config_test.go | 1 + 8 files changed, 208 insertions(+), 49 deletions(-) diff --git a/.circleci/nuke_config.yml b/.circleci/nuke_config.yml index b3908cf1..d97b5076 100644 --- a/.circleci/nuke_config.yml +++ b/.circleci/nuke_config.yml @@ -39,3 +39,48 @@ OIDCProvider: names_regex: # We have an active OIDC Provider used by github actions - ".*token.actions.githubusercontent.com.*" + +CloudWatchLogGroup: + # Use an allow list instead of block list because we haven't done the due diligence yet to figure out what log groups + # we need to keep. This is hard to do in the current scenario as we have thousands of log groups in the accounts. 
+ include: + names_regex: + - "/aws/containerinsights/eks-cluster-[a-zA-Z0-9]{6}/.*" + - "/aws/containerinsights/eks-service-catalog-[a-zA-Z0-9]{6}/.*" + - "/aws/ecs/containerinsights/[a-zA-Z0-9]{6}-cluster/.*" + - "/aws/ecs/containerinsights/[a-zA-Z0-9]{6}-ecs-cluster/.*" + - "/aws/ecs/containerinsights/Test-cluster[a-zA-Z0-9]{6}/.*" + - "/aws/eks/eks-cluster-[a-zA-Z0-9]{6}.*" + - "/aws/eks/eks-service-catalog-[a-zA-Z0-9]{6}.*" + - "/aws/lambda/Test.*" + - "/aws/lambda/[a-zA-Z0-9]{6}-ecs-deploy-runner.*" + - "/aws/lambda/ecs-deploy-runner-[a-zA-Z0-9]{6}-invoker$" + - "/aws/lambda/es-cluster-[a-zA-Z0-9]{6}.*" + - "/aws/lambda/jenkins-[a-zA-Z0-9]{6}.*" + - "/aws/lambda/jenkins[a-zA-Z0-9]{6}-0-backup$" + - "/aws/lambda/test-aurora-[a-zA-Z0-9]{6}.*" + - "/aws/lambda/[a-zA-Z0-9]{6}-handler$" + - "/aws/lambda/test[a-zA-Z0-9]{6}.*" + - "/aws/lambda/lambda-[a-zA-Z0-9]{6}$" + - "/aws/lambda/us-east-1\\.[a-zA-Z0-9]{6}cloudfront.*" + - "/aws/lambda/cleanup-expired-iam-certs-[a-zA-Z0-9]{6}$" + - "/aws/lambda/create-snapshot-example-(aurora|mysql)$" + - "/aws/lambda/HoustonExpress[a-zA-Z0-9]{6}.*" + - "/aws/rds/cluster/test[a-zA-Z0-9]{6}.*" + - "/aws/rds/cluster/test-aurora-[a-zA-Z0-9]{6}.*" + - "eks-service-catalog-[a-zA-Z0-9]{6}$" + - "eks-cluster-[a-zA-Z0-9]{6}-container-logs$" + - "es-[a-zA-Z0-9]{6}-lg$" + - "TestCloudWatchLogAggregation.*" + - "API-Gateway-Execution-Logs_.*/example$" + - "asg-[a-zA-Z0-9]{6}$" + - "^[a-zA-Z0-9]{6}-service$" + - "^[a-zA-Z0-9]{6}-ecs-deploy-runner-[a-zA-Z0-9]{6}$" + - "ecs-deploy-runner-[a-zA-Z0-9]{6}$" + - "^[a-zA-Z0-9]{6}(stage|prod)-ec2-syslog$" + - "Ubuntu1804-[a-zA-Z0-9]{6}-logs$" + - "bastion-host-[a-zA-Z0-9]{6}$" + - "ec2-instance-[a-zA-Z0-9]{6}$" + - "jenkins-[a-zA-Z0-9]{6}$" + - "openvpn-server-[a-zA-Z0-9]{6}_log_group$" + - "vpc-test-[a-zA-Z0-9]{6}.*" diff --git a/README.md b/README.md index 546fc005..b7f33cc8 100644 --- a/README.md +++ b/README.md @@ -198,6 +198,9 @@ The following resources support the Config file: - IAM OpenID 
Connect Providers - Resource type: `oidcprovider` - Config key: `OIDCProvider` +- CloudWatch LogGroups + - Resource type: `cloudwatch-loggroup` + - Config key: `CloudWatchLogGroup` - KMS customer keys - Resource type: `kmscustomerkeys` @@ -288,28 +291,29 @@ Be careful when nuking and append the `--dry-run` option if you're unsure. Even To find out what we options are supported in the config file today, consult this table. Resource types at the top level of the file that are supported are listed here. -| resource type | names | names_regex | tags | tags_regex | -|--------------------|-------|-------------|------|------------| -| s3 | none | ✅ | none | none | -| iam | none | ✅ | none | none | -| ecsserv | none | ✅ | none | none | -| ecscluster | none | ✅ | none | none | -| secretsmanager | none | ✅ | none | none | -| nat-gateway | none | ✅ | none | none | -| accessanalyzer | none | ✅ | none | none | -| dynamodb | none | ✅ | none | none | -| ebs | none | ✅ | none | none | -| lambda | none | ✅ | none | none | -| elbv2 | none | ✅ | none | none | -| ecs | none | ✅ | none | none | -| elasticache | none | ✅ | none | none | -| vpc | none | ✅ | none | none | -| oidcprovider | none | ✅ | none | none | -| kmscustomerkeys | none | ✅ | none | none | -| acmpca | none | none | none | none | -| ec2 instance | none | none | none | none | -| iam role | none | none | none | none | -| ... 
(more to come) | none | none | none | none | +| resource type | names | names_regex | tags | tags_regex | +|---------------------|-------|-------------|------|------------| +| s3 | none | ✅ | none | none | +| iam | none | ✅ | none | none | +| ecsserv | none | ✅ | none | none | +| ecscluster | none | ✅ | none | none | +| secretsmanager | none | ✅ | none | none | +| nat-gateway | none | ✅ | none | none | +| accessanalyzer | none | ✅ | none | none | +| dynamodb | none | ✅ | none | none | +| ebs | none | ✅ | none | none | +| lambda | none | ✅ | none | none | +| elbv2 | none | ✅ | none | none | +| ecs | none | ✅ | none | none | +| elasticache | none | ✅ | none | none | +| vpc | none | ✅ | none | none | +| oidcprovider | none | ✅ | none | none | +| cloudwatch-loggroup | none | ✅ | none | none | +| kmscustomerkeys | none | ✅ | none | none | +| acmpca | none | none | none | none | +| ec2 instance | none | none | none | none | +| iam role | none | none | none | none | +| ... (more to come) | none | none | none | none | diff --git a/aws/aws.go b/aws/aws.go index 8a8b9112..a6cebe1f 100644 --- a/aws/aws.go +++ b/aws/aws.go @@ -609,7 +609,7 @@ func GetAllResources(targetRegions []string, excludeAfter time.Time, resourceTyp // CloudWatchLogGroup cloudwatchLogGroups := CloudWatchLogGroups{} if IsNukeable(cloudwatchLogGroups.ResourceName(), resourceTypes) { - lgNames, err := getAllCloudWatchLogGroups(session, region) + lgNames, err := getAllCloudWatchLogGroups(session, region, excludeAfter, configObj) if err != nil { return nil, errors.WithStackTrace(err) } diff --git a/aws/cloudwatch_loggroup.go b/aws/cloudwatch_loggroup.go index 4d2a3b49..725daa63 100644 --- a/aws/cloudwatch_loggroup.go +++ b/aws/cloudwatch_loggroup.go @@ -1,29 +1,62 @@ package aws import ( + "sync" + "time" + + "github.com/aws/aws-sdk-go/aws" + "github.com/aws/aws-sdk-go/aws/awserr" "github.com/aws/aws-sdk-go/aws/session" "github.com/aws/aws-sdk-go/service/cloudwatchlogs" + 
"github.com/gruntwork-io/cloud-nuke/config" "github.com/gruntwork-io/cloud-nuke/logging" "github.com/gruntwork-io/go-commons/errors" + "github.com/hashicorp/go-multierror" ) -func getAllCloudWatchLogGroups(session *session.Session, region string) ([]*string, error) { +func getAllCloudWatchLogGroups(session *session.Session, region string, excludeAfter time.Time, configObj config.Config) ([]*string, error) { svc := cloudwatchlogs.New(session) - output, err := svc.DescribeLogGroups(&cloudwatchlogs.DescribeLogGroupsInput{}) + allLogGroups := []*string{} + err := svc.DescribeLogGroupsPages( + &cloudwatchlogs.DescribeLogGroupsInput{}, + func(page *cloudwatchlogs.DescribeLogGroupsOutput, lastPage bool) bool { + for _, logGroup := range page.LogGroups { + if shouldIncludeCloudWatchLogGroup(logGroup, excludeAfter, configObj) { + allLogGroups = append(allLogGroups, logGroup.LogGroupName) + } + } + return !lastPage + }, + ) if err != nil { return nil, errors.WithStackTrace(err) } + return allLogGroups, nil +} + +func shouldIncludeCloudWatchLogGroup(logGroup *cloudwatchlogs.LogGroup, excludeAfter time.Time, configObj config.Config) bool { + if logGroup == nil { + return false + } - var names []*string - for _, loggroup := range output.LogGroups { - names = append(names, loggroup.LogGroupName) + if logGroup.CreationTime != nil { + // Convert milliseconds since epoch to time.Time object + creationTime := time.Unix(0, aws.Int64Value(logGroup.CreationTime)*int64(time.Millisecond)) + if excludeAfter.Before(creationTime) { + return false + } } - return names, nil + return config.ShouldInclude( + aws.StringValue(logGroup.LogGroupName), + configObj.CloudWatchLogGroup.IncludeRule.NamesRegExp, + configObj.CloudWatchLogGroup.ExcludeRule.NamesRegExp, + ) } func nukeAllCloudWatchLogGroups(session *session.Session, identifiers []*string) error { + region := aws.StringValue(session.Config.Region) svc := cloudwatchlogs.New(session) if len(identifiers) == 0 { @@ -31,24 +64,71 @@ func 
nukeAllCloudWatchLogGroups(session *session.Session, identifiers []*string) return nil } - logging.Logger.Infof("Deleting all CloudWatch Log Groups in region %s", *session.Config.Region) + // NOTE: we don't need to do pagination here, because the pagination is handled by the caller to this function, + // based on CloudWatchLogGroup.MaxBatchSize, however we add a guard here to warn users when the batching fails and + // has a chance of throttling AWS. Since we concurrently make one call for each identifier, we pick 100 for the + // limit here because many APIs in AWS have a limit of 100 requests per second. + if len(identifiers) > 100 { + logging.Logger.Errorf("Nuking too many CloudWatch LogGroups at once (100): halting to avoid hitting AWS API rate limiting") + return TooManyLogGroupsErr{} + } - var deleteResources = 0 + // There is no bulk delete CloudWatch Log Group API, so we delete the batch of CloudWatch Log Groups concurrently + // using go routines. + logging.Logger.Infof("Deleting CloudWatch Log Groups in region %s", region) + wg := new(sync.WaitGroup) + wg.Add(len(identifiers)) + errChans := make([]chan error, len(identifiers)) + for i, logGroupName := range identifiers { + errChans[i] = make(chan error, 1) + go deleteCloudWatchLogGroupAsync(wg, errChans[i], svc, logGroupName, region) + } + wg.Wait() - for _, name := range identifiers { - _, err := svc.DeleteLogGroup(&cloudwatchlogs.DeleteLogGroupInput{ - LogGroupName: name, - }) - if err != nil { - logging.Logger.Errorf("[Failed] %s", err) - } else { - logging.Logger.Infof("[OK] CloudWatch Log Group %s terminated in %s", *name, *session.Config.Region) - deleteResources++ + // Collect all the errors from the async delete calls into a single error struct. + // NOTE: We ignore OperationAbortedException which is thrown when there is an eventual consistency issue, where + // cloud-nuke picks up a Log Group that is already requested to be deleted. 
+ var allErrs *multierror.Error + for _, errChan := range errChans { + if err := <-errChan; err != nil { + if awsErr, ok := err.(awserr.Error); ok && awsErr.Code() != "OperationAbortedException" { + allErrs = multierror.Append(allErrs, err) + } } + } + finalErr := allErrs.ErrorOrNil() + if finalErr != nil { + return errors.WithStackTrace(finalErr) + } + return nil +} + +// deleteCloudWatchLogGroupAsync deletes the provided Log Group asynchronously in a goroutine, using wait groups for +// concurrency control and a return channel for errors. +func deleteCloudWatchLogGroupAsync( + wg *sync.WaitGroup, + errChan chan error, + svc *cloudwatchlogs.CloudWatchLogs, + logGroupName *string, + region string, +) { + defer wg.Done() + input := &cloudwatchlogs.DeleteLogGroupInput{LogGroupName: logGroupName} + _, err := svc.DeleteLogGroup(input) + errChan <- err + logGroupNameStr := aws.StringValue(logGroupName) + if err == nil { + logging.Logger.Infof("[OK] CloudWatch Log Group %s deleted in %s", logGroupNameStr, region) + } else { + logging.Logger.Errorf("[Failed] Error deleting CloudWatch Log Group %s in %s: %s", logGroupNameStr, region, err) } +} - logging.Logger.Infof("[OK] %d CloudWatch Log Group(s) terminated in %s", deleteResources, *session.Config.Region) +// Custom errors - return nil +type TooManyLogGroupsErr struct{} + +func (err TooManyLogGroupsErr) Error() string { + return "Too many LogGroups requested at once." 
} diff --git a/aws/cloudwatch_loggroup_test.go b/aws/cloudwatch_loggroup_test.go index 9999a487..3429ac71 100644 --- a/aws/cloudwatch_loggroup_test.go +++ b/aws/cloudwatch_loggroup_test.go @@ -9,6 +9,7 @@ import ( "github.com/aws/aws-sdk-go/aws" "github.com/aws/aws-sdk-go/aws/session" "github.com/aws/aws-sdk-go/service/cloudwatchlogs" + "github.com/gruntwork-io/cloud-nuke/config" "github.com/gruntwork-io/cloud-nuke/util" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -25,14 +26,39 @@ func TestListCloudWatchLogGroups(t *testing.T) { svc := cloudwatchlogs.New(session) - lgName := createCloudWatchLogGroup(t, svc, region) + lgName := createCloudWatchLogGroup(t, svc) defer deleteCloudWatchLogGroup(t, svc, lgName, true) - lgNames, err := getAllCloudWatchLogGroups(session, region) + lgNames, err := getAllCloudWatchLogGroups(session, region, time.Now(), config.Config{}) require.NoError(t, err) assert.Contains(t, aws.StringValueSlice(lgNames), aws.StringValue(lgName)) } +func TestTimeFilterExclusionNewlyCreatedCloudWatchLogGroup(t *testing.T) { + t.Parallel() + + region, err := getRandomRegion() + require.NoError(t, err) + + session, err := session.NewSession(&aws.Config{Region: aws.String(region)}) + require.NoError(t, err) + svc := cloudwatchlogs.New(session) + + lgName := createCloudWatchLogGroup(t, svc) + defer deleteCloudWatchLogGroup(t, svc, lgName, true) + + // Assert the CloudWatch Log Group is picked up without filters + lgNames, err := getAllCloudWatchLogGroups(session, region, time.Now(), config.Config{}) + require.NoError(t, err) + assert.Contains(t, aws.StringValueSlice(lgNames), aws.StringValue(lgName)) + + // Assert the log group doesn't appear when we look at log groups older than 1 hour + olderThan := time.Now().Add(-1 * time.Hour) + lgNamesOlder, err := getAllCloudWatchLogGroups(session, region, olderThan, config.Config{}) + require.NoError(t, err) + assert.NotContains(t, aws.StringValueSlice(lgNamesOlder), aws.StringValue(lgName)) +} + func 
TestNukeCloudWatchLogGroupOne(t *testing.T) { t.Parallel() @@ -44,7 +70,7 @@ func TestNukeCloudWatchLogGroupOne(t *testing.T) { svc := cloudwatchlogs.New(session) // We ignore errors in the delete call here, because it is intended to be a stop gap in case there is a bug in nuke. - lgName := createCloudWatchLogGroup(t, svc, region) + lgName := createCloudWatchLogGroup(t, svc) defer deleteCloudWatchLogGroup(t, svc, lgName, false) identifiers := []*string{lgName} @@ -70,7 +96,7 @@ func TestNukeCloudWatchLogGroupMoreThanOne(t *testing.T) { lgNames := []*string{} for i := 0; i < 3; i++ { // We ignore errors in the delete call here, because it is intended to be a stop gap in case there is a bug in nuke. - lgName := createCloudWatchLogGroup(t, svc, region) + lgName := createCloudWatchLogGroup(t, svc) defer deleteCloudWatchLogGroup(t, svc, lgName, false) lgNames = append(lgNames, lgName) } @@ -84,7 +110,7 @@ func TestNukeCloudWatchLogGroupMoreThanOne(t *testing.T) { assertCloudWatchLogGroupsDeleted(t, svc, lgNames) } -func createCloudWatchLogGroup(t *testing.T, svc *cloudwatchlogs.CloudWatchLogs, region string) *string { +func createCloudWatchLogGroup(t *testing.T, svc *cloudwatchlogs.CloudWatchLogs) *string { uniqueID := util.UniqueID() name := fmt.Sprintf("cloud-nuke-test-%s", strings.ToLower(uniqueID)) diff --git a/aws/cloudwatch_loggroup_types.go b/aws/cloudwatch_loggroup_types.go index c38f0601..525d4f67 100644 --- a/aws/cloudwatch_loggroup_types.go +++ b/aws/cloudwatch_loggroup_types.go @@ -22,8 +22,10 @@ func (r CloudWatchLogGroups) ResourceIdentifiers() []string { } func (r CloudWatchLogGroups) MaxBatchSize() int { - // Tentative batch size to ensure AWS doesn't throttle - return 200 + // Tentative batch size to ensure AWS doesn't throttle. Note that CloudWatch Logs does not support bulk delete, so + // we will be deleting this many in parallel using go routines. We pick 35 here, which is half of what the AWS web + // console will do. 
We pick a conservative number here to avoid hitting AWS API rate limits. + return 35 } // Nuke - nuke 'em all!!! diff --git a/config/config.go b/config/config.go index b1867537..a51a02bc 100644 --- a/config/config.go +++ b/config/config.go @@ -26,6 +26,7 @@ type Config struct { Elasticache ResourceType `yaml:"Elasticache"` VPC ResourceType `yaml:"VPC"` OIDCProvider ResourceType `yaml:"OIDCProvider"` + CloudWatchLogGroup ResourceType `yaml:"CloudWatchLogGroup"` } type ResourceType struct { diff --git a/config/config_test.go b/config/config_test.go index cf9949ab..61b62b16 100644 --- a/config/config_test.go +++ b/config/config_test.go @@ -27,6 +27,7 @@ func emptyConfig() *Config { ResourceType{FilterRule{}, FilterRule{}}, ResourceType{FilterRule{}, FilterRule{}}, ResourceType{FilterRule{}, FilterRule{}}, + ResourceType{FilterRule{}, FilterRule{}}, } }