Skip to content

Commit

Permalink
Update cloudwatch nuking to support filtering and concurrent deletes
Browse files Browse the repository at this point in the history
  • Loading branch information
yorinasub17 committed Mar 10, 2022
1 parent 00ccb16 commit 32bc385
Show file tree
Hide file tree
Showing 8 changed files with 208 additions and 49 deletions.
45 changes: 45 additions & 0 deletions .circleci/nuke_config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -39,3 +39,48 @@ OIDCProvider:
names_regex:
# We have an active OIDC Provider used by github actions
- ".*token.actions.githubusercontent.com.*"

CloudWatchLogGroup:
# Use an allow list instead of block list because we haven't done the due diligence yet to figure out what log groups
# we need to keep. This is hard to do in the current scenario as we have thousands of log groups in the accounts.
include:
names_regex:
- "/aws/containerinsights/eks-cluster-[a-zA-Z0-9]{6}/.*"
- "/aws/containerinsights/eks-service-catalog-[a-zA-Z0-9]{6}/.*"
- "/aws/ecs/containerinsights/[a-zA-Z0-9]{6}-cluster/.*"
- "/aws/ecs/containerinsights/[a-zA-Z0-9]{6}-ecs-cluster/.*"
- "/aws/ecs/containerinsights/Test-cluster[a-zA-Z0-9]{6}/.*"
- "/aws/eks/eks-cluster-[a-zA-Z0-9]{6}.*"
- "/aws/eks/eks-service-catalog-[a-zA-Z0-9]{6}.*"
- "/aws/lambda/Test.*"
- "/aws/lambda/[a-zA-Z0-9]{6}-ecs-deploy-runner.*"
- "/aws/lambda/ecs-deploy-runner-[a-zA-Z0-9]{6}-invoker$"
- "/aws/lambda/es-cluster-[a-zA-Z0-9]{6}.*"
- "/aws/lambda/jenkins-[a-zA-Z0-9]{6}.*"
- "/aws/lambda/jenkins[a-zA-Z0-9]{6}-0-backup$"
- "/aws/lambda/test-aurora-[a-zA-Z0-9]{6}.*"
- "/aws/lambda/[a-zA-Z0-9]{6}-handler$"
- "/aws/lambda/test[a-zA-Z0-9]{6}.*"
- "/aws/lambda/lambda-[a-zA-Z0-9]{6}$"
- "/aws/lambda/us-east-1\\.[a-zA-Z0-9]{6}cloudfront.*"
- "/aws/lambda/cleanup-expired-iam-certs-[a-zA-Z0-9]{6}$"
- "/aws/lambda/create-snapshot-example-(aurora|mysql)$"
- "/aws/lambda/HoustonExpress[a-zA-Z0-9]{6}.*"
- "/aws/rds/cluster/test[a-zA-Z0-9]{6}.*"
- "/aws/rds/cluster/test-aurora-[a-zA-Z0-9]{6}.*"
- "eks-service-catalog-[a-zA-Z0-9]{6}$"
- "eks-cluster-[a-zA-Z0-9]{6}-container-logs$"
- "es-[a-zA-Z0-9]{6}-lg$"
- "TestCloudWatchLogAggregation.*"
- "API-Gateway-Execution-Logs_.*/example$"
- "asg-[a-zA-Z0-9]{6}$"
- "^[a-zA-Z0-9]{6}-service$"
- "^[a-zA-Z0-9]{6}-ecs-deploy-runner-[a-zA-Z0-9]{6}$"
- "ecs-deploy-runner-[a-zA-Z0-9]{6}$"
- "^[a-zA-Z0-9]{6}(stage|prod)-ec2-syslog$"
- "Ubuntu1804-[a-zA-Z0-9]{6}-logs$"
- "bastion-host-[a-zA-Z0-9]{6}$"
- "ec2-instance-[a-zA-Z0-9]{6}$"
- "jenkins-[a-zA-Z0-9]{6}$"
- "openvpn-server-[a-zA-Z0-9]{6}_log_group$"
- "vpc-test-[a-zA-Z0-9]{6}.*"
48 changes: 26 additions & 22 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,9 @@ The following resources support the Config file:
- IAM OpenID Connect Providers
- Resource type: `oidcprovider`
- Config key: `OIDCProvider`
- CloudWatch LogGroups
- Resource type: `cloudwatch-loggroup`
- Config key: `CloudWatchLogGroup`
- KMS customer keys
- Resource type: `kmscustomerkeys`

Expand Down Expand Up @@ -288,28 +291,29 @@ Be careful when nuking and append the `--dry-run` option if you're unsure. Even

To find out what we options are supported in the config file today, consult this table. Resource types at the top level of the file that are supported are listed here.

| resource type | names | names_regex | tags | tags_regex |
|--------------------|-------|-------------|------|------------|
| s3 | none | ✅ | none | none |
| iam | none | ✅ | none | none |
| ecsserv | none | ✅ | none | none |
| ecscluster | none | ✅ | none | none |
| secretsmanager | none | ✅ | none | none |
| nat-gateway | none | ✅ | none | none |
| accessanalyzer | none | ✅ | none | none |
| dynamodb | none | ✅ | none | none |
| ebs | none | ✅ | none | none |
| lambda | none | ✅ | none | none |
| elbv2 | none | ✅ | none | none |
| ecs | none | ✅ | none | none |
| elasticache | none | ✅ | none | none |
| vpc | none | ✅ | none | none |
| oidcprovider | none | ✅ | none | none |
| kmscustomerkeys | none | ✅ | none | none |
| acmpca | none | none | none | none |
| ec2 instance | none | none | none | none |
| iam role | none | none | none | none |
| ... (more to come) | none | none | none | none |
| resource type | names | names_regex | tags | tags_regex |
|---------------------|-------|-------------|------|------------|
| s3 | none | ✅ | none | none |
| iam | none | ✅ | none | none |
| ecsserv | none | ✅ | none | none |
| ecscluster | none | ✅ | none | none |
| secretsmanager | none | ✅ | none | none |
| nat-gateway | none | ✅ | none | none |
| accessanalyzer | none | ✅ | none | none |
| dynamodb | none | ✅ | none | none |
| ebs | none | ✅ | none | none |
| lambda | none | ✅ | none | none |
| elbv2 | none | ✅ | none | none |
| ecs | none | ✅ | none | none |
| elasticache | none | ✅ | none | none |
| vpc | none | ✅ | none | none |
| oidcprovider | none | ✅ | none | none |
| cloudwatch-loggroup | none | ✅ | none | none |
| kmscustomerkeys | none | ✅ | none | none |
| acmpca | none | none | none | none |
| ec2 instance | none | none | none | none |
| iam role | none | none | none | none |
| ... (more to come) | none | none | none | none |



Expand Down
2 changes: 1 addition & 1 deletion aws/aws.go
Original file line number Diff line number Diff line change
Expand Up @@ -609,7 +609,7 @@ func GetAllResources(targetRegions []string, excludeAfter time.Time, resourceTyp
// CloudWatchLogGroup
cloudwatchLogGroups := CloudWatchLogGroups{}
if IsNukeable(cloudwatchLogGroups.ResourceName(), resourceTypes) {
lgNames, err := getAllCloudWatchLogGroups(session, region)
lgNames, err := getAllCloudWatchLogGroups(session, region, excludeAfter, configObj)
if err != nil {
return nil, errors.WithStackTrace(err)
}
Expand Down
118 changes: 99 additions & 19 deletions aws/cloudwatch_loggroup.go
Original file line number Diff line number Diff line change
@@ -1,54 +1,134 @@
package aws

import (
"sync"
"time"

"github.com/aws/aws-sdk-go/aws"
"github.com/aws/aws-sdk-go/aws/awserr"
"github.com/aws/aws-sdk-go/aws/session"
"github.com/aws/aws-sdk-go/service/cloudwatchlogs"
"github.com/gruntwork-io/cloud-nuke/config"
"github.com/gruntwork-io/cloud-nuke/logging"
"github.com/gruntwork-io/go-commons/errors"
"github.com/hashicorp/go-multierror"
)

func getAllCloudWatchLogGroups(session *session.Session, region string) ([]*string, error) {
func getAllCloudWatchLogGroups(session *session.Session, region string, excludeAfter time.Time, configObj config.Config) ([]*string, error) {
svc := cloudwatchlogs.New(session)

output, err := svc.DescribeLogGroups(&cloudwatchlogs.DescribeLogGroupsInput{})
allLogGroups := []*string{}
err := svc.DescribeLogGroupsPages(
&cloudwatchlogs.DescribeLogGroupsInput{},
func(page *cloudwatchlogs.DescribeLogGroupsOutput, lastPage bool) bool {
for _, logGroup := range page.LogGroups {
if shouldIncludeCloudWatchLogGroup(logGroup, excludeAfter, configObj) {
allLogGroups = append(allLogGroups, logGroup.LogGroupName)
}
}
return !lastPage
},
)
if err != nil {
return nil, errors.WithStackTrace(err)
}
return allLogGroups, nil
}

func shouldIncludeCloudWatchLogGroup(logGroup *cloudwatchlogs.LogGroup, excludeAfter time.Time, configObj config.Config) bool {
if logGroup == nil {
return false
}

var names []*string
for _, loggroup := range output.LogGroups {
names = append(names, loggroup.LogGroupName)
if logGroup.CreationTime != nil {
// Convert milliseconds since epoch to time.Time object
creationTime := time.Unix(0, aws.Int64Value(logGroup.CreationTime)*int64(time.Millisecond))
if excludeAfter.Before(creationTime) {
return false
}
}

return names, nil
return config.ShouldInclude(
aws.StringValue(logGroup.LogGroupName),
configObj.CloudWatchLogGroup.IncludeRule.NamesRegExp,
configObj.CloudWatchLogGroup.ExcludeRule.NamesRegExp,
)
}

func nukeAllCloudWatchLogGroups(session *session.Session, identifiers []*string) error {
region := aws.StringValue(session.Config.Region)
svc := cloudwatchlogs.New(session)

if len(identifiers) == 0 {
logging.Logger.Infof("No CloudWatch Log Groups to nuke in region %s", *session.Config.Region)
return nil
}

logging.Logger.Infof("Deleting all CloudWatch Log Groups in region %s", *session.Config.Region)
// NOTE: we don't need to do pagination here, because the pagination is handled by the caller to this function,
// based on CloudWatchLogGroup.MaxBatchSize, however we add a guard here to warn users when the batching fails and
// has a chance of throttling AWS. Since we concurrently make one call for each identifier, we pick 100 for the
// limit here because many APIs in AWS have a limit of 100 requests per second.
if len(identifiers) > 100 {
logging.Logger.Errorf("Nuking too many CloudWatch LogGroups at once (100): halting to avoid hitting AWS API rate limiting")
return TooManyLogGroupsErr{}
}

var deleteResources = 0
// There is no bulk delete CloudWatch Log Group API, so we delete the batch of CloudWatch Log Groups concurrently
// using go routines.
logging.Logger.Infof("Deleting CloudWatch Log Groups in region %s", region)
wg := new(sync.WaitGroup)
wg.Add(len(identifiers))
errChans := make([]chan error, len(identifiers))
for i, logGroupName := range identifiers {
errChans[i] = make(chan error, 1)
go deleteCloudWatchLogGroupAsync(wg, errChans[i], svc, logGroupName, region)
}
wg.Wait()

for _, name := range identifiers {
_, err := svc.DeleteLogGroup(&cloudwatchlogs.DeleteLogGroupInput{
LogGroupName: name,
})
if err != nil {
logging.Logger.Errorf("[Failed] %s", err)
} else {
logging.Logger.Infof("[OK] CloudWatch Log Group %s terminated in %s", *name, *session.Config.Region)
deleteResources++
// Collect all the errors from the async delete calls into a single error struct.
// NOTE: We ignore OperationAbortedException which is thrown when there is an eventual consistency issue, where
// cloud-nuke picks up a Log Group that is already requested to be deleted.
var allErrs *multierror.Error
for _, errChan := range errChans {
if err := <-errChan; err != nil {
if awsErr, ok := err.(awserr.Error); ok && awsErr.Code() != "OperationAbortedException" {
allErrs = multierror.Append(allErrs, err)
}
}
}
finalErr := allErrs.ErrorOrNil()
if finalErr != nil {
return errors.WithStackTrace(finalErr)
}
return nil
}

// deleteCloudWatchLogGroupAsync deletes the provided Log Group asynchronously in a goroutine, using wait groups for
// concurrency control and a return channel for errors.
func deleteCloudWatchLogGroupAsync(
wg *sync.WaitGroup,
errChan chan error,
svc *cloudwatchlogs.CloudWatchLogs,
logGroupName *string,
region string,
) {
defer wg.Done()
input := &cloudwatchlogs.DeleteLogGroupInput{LogGroupName: logGroupName}
_, err := svc.DeleteLogGroup(input)
errChan <- err

logGroupNameStr := aws.StringValue(logGroupName)
if err == nil {
logging.Logger.Infof("[OK] CloudWatch Log Group %s deleted in %s", logGroupNameStr, region)
} else {
logging.Logger.Errorf("[Failed] Error deleting CloudWatch Log Group %s in %s: %s", logGroupNameStr, region, err)
}
}

logging.Logger.Infof("[OK] %d CloudWatch Log Group(s) terminated in %s", deleteResources, *session.Config.Region)
// Custom errors

return nil
type TooManyLogGroupsErr struct{}

func (err TooManyLogGroupsErr) Error() string {
return "Too many LogGroups requested at once."
}
36 changes: 31 additions & 5 deletions aws/cloudwatch_loggroup_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (
"github.com/aws/aws-sdk-go/aws"
"github.com/aws/aws-sdk-go/aws/session"
"github.com/aws/aws-sdk-go/service/cloudwatchlogs"
"github.com/gruntwork-io/cloud-nuke/config"
"github.com/gruntwork-io/cloud-nuke/util"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
Expand All @@ -25,14 +26,39 @@ func TestListCloudWatchLogGroups(t *testing.T) {

svc := cloudwatchlogs.New(session)

lgName := createCloudWatchLogGroup(t, svc, region)
lgName := createCloudWatchLogGroup(t, svc)
defer deleteCloudWatchLogGroup(t, svc, lgName, true)

lgNames, err := getAllCloudWatchLogGroups(session, region)
lgNames, err := getAllCloudWatchLogGroups(session, region, time.Now(), config.Config{})
require.NoError(t, err)
assert.Contains(t, aws.StringValueSlice(lgNames), aws.StringValue(lgName))
}

func TestTimeFilterExclusionNewlyCreatedCloudWatchLogGroup(t *testing.T) {
t.Parallel()

region, err := getRandomRegion()
require.NoError(t, err)

session, err := session.NewSession(&aws.Config{Region: aws.String(region)})
require.NoError(t, err)
svc := cloudwatchlogs.New(session)

lgName := createCloudWatchLogGroup(t, svc)
defer deleteCloudWatchLogGroup(t, svc, lgName, true)

// Assert CloudWatch Dashboard is picked up without filters
lgNames, err := getAllCloudWatchLogGroups(session, region, time.Now(), config.Config{})
require.NoError(t, err)
assert.Contains(t, aws.StringValueSlice(lgNames), aws.StringValue(lgName))

// Assert user doesn't appear when we look at users older than 1 Hour
olderThan := time.Now().Add(-1 * time.Hour)
lgNamesOlder, err := getAllCloudWatchLogGroups(session, region, olderThan, config.Config{})
require.NoError(t, err)
assert.NotContains(t, aws.StringValueSlice(lgNamesOlder), aws.StringValue(lgName))
}

func TestNukeCloudWatchLogGroupOne(t *testing.T) {
t.Parallel()

Expand All @@ -44,7 +70,7 @@ func TestNukeCloudWatchLogGroupOne(t *testing.T) {
svc := cloudwatchlogs.New(session)

// We ignore errors in the delete call here, because it is intended to be a stop gap in case there is a bug in nuke.
lgName := createCloudWatchLogGroup(t, svc, region)
lgName := createCloudWatchLogGroup(t, svc)
defer deleteCloudWatchLogGroup(t, svc, lgName, false)
identifiers := []*string{lgName}

Expand All @@ -70,7 +96,7 @@ func TestNukeCloudWatchLogGroupMoreThanOne(t *testing.T) {
lgNames := []*string{}
for i := 0; i < 3; i++ {
// We ignore errors in the delete call here, because it is intended to be a stop gap in case there is a bug in nuke.
lgName := createCloudWatchLogGroup(t, svc, region)
lgName := createCloudWatchLogGroup(t, svc)
defer deleteCloudWatchLogGroup(t, svc, lgName, false)
lgNames = append(lgNames, lgName)
}
Expand All @@ -84,7 +110,7 @@ func TestNukeCloudWatchLogGroupMoreThanOne(t *testing.T) {
assertCloudWatchLogGroupsDeleted(t, svc, lgNames)
}

func createCloudWatchLogGroup(t *testing.T, svc *cloudwatchlogs.CloudWatchLogs, region string) *string {
func createCloudWatchLogGroup(t *testing.T, svc *cloudwatchlogs.CloudWatchLogs) *string {
uniqueID := util.UniqueID()
name := fmt.Sprintf("cloud-nuke-test-%s", strings.ToLower(uniqueID))

Expand Down
6 changes: 4 additions & 2 deletions aws/cloudwatch_loggroup_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,10 @@ func (r CloudWatchLogGroups) ResourceIdentifiers() []string {
}

func (r CloudWatchLogGroups) MaxBatchSize() int {
// Tentative batch size to ensure AWS doesn't throttle
return 200
// Tentative batch size to ensure AWS doesn't throttle. Note that CloudWatch Logs does not support bulk delete, so
// we will be deleting this many in parallel using go routines. We pick 35 here, which is half of what the AWS web
// console will do. We pick a conservative number here to avoid hitting AWS API rate limits.
return 35
}

// Nuke - nuke 'em all!!!
Expand Down
1 change: 1 addition & 0 deletions config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ type Config struct {
Elasticache ResourceType `yaml:"Elasticache"`
VPC ResourceType `yaml:"VPC"`
OIDCProvider ResourceType `yaml:"OIDCProvider"`
CloudWatchLogGroup ResourceType `yaml:"CloudWatchLogGroup"`
}

type ResourceType struct {
Expand Down
1 change: 1 addition & 0 deletions config/config_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ func emptyConfig() *Config {
ResourceType{FilterRule{}, FilterRule{}},
ResourceType{FilterRule{}, FilterRule{}},
ResourceType{FilterRule{}, FilterRule{}},
ResourceType{FilterRule{}, FilterRule{}},
}
}

Expand Down

0 comments on commit 32bc385

Please sign in to comment.