Skip to content

Commit

Permalink
Merge pull request #30114 from hashicorp/b-sg-dependent-obj-retry
Browse files Browse the repository at this point in the history
security_group: Destroying takes forever improvements
  • Loading branch information
YakDriver authored Mar 20, 2023
2 parents 6233361 + 229a971 commit ddde12c
Show file tree
Hide file tree
Showing 6 changed files with 136 additions and 15 deletions.
3 changes: 3 additions & 0 deletions .changelog/30114.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
```release-note:bug
resource/aws_security_group: Improve respect for delete timeout set by user and retry of certain errors
```
8 changes: 8 additions & 0 deletions GNUmakefile
Original file line number Diff line number Diff line change
Expand Up @@ -280,6 +280,10 @@ testacc: fmtcheck
fi
TF_ACC=1 $(GO_VER) test ./$(PKG_NAME)/... -v -count $(TEST_COUNT) -parallel $(ACCTEST_PARALLELISM) $(RUNARGS) $(TESTARGS) -timeout $(ACCTEST_TIMEOUT)

testacc-short: fmtcheck
@echo "Running acceptance tests with -short flag"
TF_ACC=1 $(GO_VER) test ./$(PKG_NAME)/... -v -short -count $(TEST_COUNT) -parallel $(ACCTEST_PARALLELISM) $(RUNARGS) $(TESTARGS) -timeout $(ACCTEST_TIMEOUT)

testacc-lint:
@echo "Checking acceptance tests with terrafmt"
find $(SVC_DIR) -type f -name '*_test.go' \
Expand Down Expand Up @@ -307,6 +311,8 @@ tools:
cd .ci/tools && $(GO_VER) install github.com/rhysd/actionlint/cmd/actionlint
cd .ci/tools && $(GO_VER) install mvdan.cc/gofumpt

ts: testacc-short

website-link-check:
@.ci/scripts/markdown-link-check.sh

Expand Down Expand Up @@ -366,10 +372,12 @@ yamllint:
test \
test-compile \
testacc \
testaccs \
testacc-lint \
testacc-lint-fix \
tfsdk2fw \
tools \
ts \
website-link-check \
website-link-check-ghrc \
website-lint \
Expand Down
12 changes: 9 additions & 3 deletions internal/service/ec2/vpc_security_group.go
Original file line number Diff line number Diff line change
Expand Up @@ -380,10 +380,16 @@ func resourceSecurityGroupDelete(ctx context.Context, d *schema.ResourceData, me
}
}

firstShortRetry := 1 * time.Minute
remainingRetry := d.Timeout(schema.TimeoutDelete) - firstShortRetry
if d.Timeout(schema.TimeoutDelete) < 1*time.Minute {
remainingRetry = 30 * time.Second
}

log.Printf("[DEBUG] Deleting Security Group: %s", d.Id())
_, err := tfresource.RetryWhenAWSErrCodeEquals(
ctx,
2*time.Minute, // short initial attempt followed by full length attempt
firstShortRetry, // short initial attempt followed by full length attempt
func() (interface{}, error) {
return conn.DeleteSecurityGroupWithContext(ctx, &ec2.DeleteSecurityGroupInput{
GroupId: aws.String(d.Id()),
Expand All @@ -392,7 +398,7 @@ func resourceSecurityGroupDelete(ctx context.Context, d *schema.ResourceData, me
errCodeDependencyViolation, errCodeInvalidGroupInUse,
)

if tfawserr.ErrCodeEquals(err, errCodeDependencyViolation) {
if tfawserr.ErrCodeEquals(err, errCodeDependencyViolation) || tfawserr.ErrCodeEquals(err, errCodeInvalidGroupInUse) {
if v := d.Get("revoke_rules_on_delete").(bool); v {
err := forceRevokeSecurityGroupRules(ctx, conn, d.Id(), true)

Expand All @@ -403,7 +409,7 @@ func resourceSecurityGroupDelete(ctx context.Context, d *schema.ResourceData, me

_, err = tfresource.RetryWhenAWSErrCodeEquals(
ctx,
d.Timeout(schema.TimeoutDelete),
remainingRetry,
func() (interface{}, error) {
return conn.DeleteSecurityGroupWithContext(ctx, &ec2.DeleteSecurityGroupInput{
GroupId: aws.String(d.Id()),
Expand Down
8 changes: 8 additions & 0 deletions internal/service/ec2/vpc_security_group_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1648,6 +1648,10 @@ func TestAccVPCSecurityGroup_forceRevokeRulesTrue(t *testing.T) {

func TestAccVPCSecurityGroup_forceRevokeRulesFalse(t *testing.T) {
ctx := acctest.Context(t)
if testing.Short() {
t.Skip("skipping long-running test in short mode")
}

var primary ec2.SecurityGroup
var secondary ec2.SecurityGroup
rName := sdkacctest.RandomWithPrefix(acctest.ResourcePrefix)
Expand Down Expand Up @@ -2807,6 +2811,10 @@ func TestAccVPCSecurityGroup_rulesDropOnError(t *testing.T) {
// a rule in another SG, it could not previously be deleted.
func TestAccVPCSecurityGroup_emrDependencyViolation(t *testing.T) {
ctx := acctest.Context(t)
if testing.Short() {
t.Skip("skipping long-running test in short mode")
}

var group ec2.SecurityGroup
resourceName := "aws_security_group.allow_access"
rName := sdkacctest.RandomWithPrefix(acctest.ResourcePrefix)
Expand Down
2 changes: 1 addition & 1 deletion website/docs/r/codestarconnections_host.markdown
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ The following arguments are supported:

A `vpc_configuration` block supports the following arguments:

* `security_group_ids` - (Required) he ID of the security group or security groups associated with the Amazon VPC connected to the infrastructure where your provider type is installed.
* `security_group_ids` - (Required) ID of the security group or security groups associated with the Amazon VPC connected to the infrastructure where your provider type is installed.
* `subnet_ids` - (Required) The ID of the subnet or subnets associated with the Amazon VPC connected to the infrastructure where your provider type is installed.
* `tls_certificate` - (Optional) The value of the Transport Layer Security (TLS) certificate associated with the infrastructure where your provider type is installed.
* `vpc_id` - (Required) The ID of the Amazon VPC connected to the infrastructure where your provider type is installed.
Expand Down
118 changes: 107 additions & 11 deletions website/docs/r/security_group.html.markdown
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,7 @@ description: |-

Provides a security group resource.

~> **NOTE on Security Groups and Security Group Rules:** Terraform currently provides a Security Group resource with `ingress` and `egress` rules defined in-line and a [Security Group Rule resource](security_group_rule.html) which manages one or more `ingress` or
`egress` rules. Both of these resource were added before AWS assigned a [security group rule unique ID](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/security-group-rules.html), and they do not work well in all scenarios using the`description` and `tags` attributes, which rely on the unique ID.
The [`aws_vpc_security_group_egress_rule`](vpc_security_group_egress_rule.html) and [`aws_vpc_security_group_ingress_rule`](vpc_security_group_ingress_rule.html) resources have been added to address these limitations and should be used for all new security group rules.
You should not use the `aws_vpc_security_group_egress_rule` and `aws_vpc_security_group_ingress_rule` resources in conjunction with an `aws_security_group` resource with in-line rules or with `aws_security_group_rule` resources defined for the same Security Group, as rule conflicts may occur and rules will be overwritten.
~> **NOTE on Security Groups and Security Group Rules:** Terraform currently provides a Security Group resource with `ingress` and `egress` rules defined in-line and a [Security Group Rule resource](security_group_rule.html) which manages one or more `ingress` or `egress` rules. Both of these resource were added before AWS assigned a [security group rule unique ID](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/security-group-rules.html), and they do not work well in all scenarios using the`description` and `tags` attributes, which rely on the unique ID. The [`aws_vpc_security_group_egress_rule`](vpc_security_group_egress_rule.html) and [`aws_vpc_security_group_ingress_rule`](vpc_security_group_ingress_rule.html) resources have been added to address these limitations and should be used for all new security group rules. You should not use the `aws_vpc_security_group_egress_rule` and `aws_vpc_security_group_ingress_rule` resources in conjunction with an `aws_security_group` resource with in-line rules or with `aws_security_group_rule` resources defined for the same Security Group, as rule conflicts may occur and rules will be overwritten.

~> **NOTE:** Referencing Security Groups across VPC peering has certain restrictions. More information is available in the [VPC Peering User Guide](https://docs.aws.amazon.com/vpc/latest/peering/vpc-peering-security-groups.html).

Expand Down Expand Up @@ -96,24 +93,124 @@ resource "aws_vpc_endpoint" "my_endpoint" {

You can also find a specific Prefix List using the `aws_prefix_list` data source.

### Change of name or name-prefix value
### Recreating a Security Group

Security Group's Name [cannot be edited after the resource is created](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/working-with-security-groups.html#creating-security-group). In fact, the `name` and `name-prefix` arguments force the creation of a new Security Group resource when they change value. In that case, Terraform first deletes the existing Security Group resource and then it creates a new one. If the existing Security Group is associated to a Network Interface resource, the deletion cannot complete. The reason is that Network Interface resources cannot be left with no Security Group attached and the new one is not yet available at that point.
A simple security group `name` change "forces new" the security group--Terraform destroys the security group and creates a new one. (Likewise, `description`, `name_prefix`, or `vpc_id` [cannot be changed](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/working-with-security-groups.html#creating-security-group).) Attempting to recreate the security group leads to a variety of complications depending on how it is used.

You must invert the default behavior of Terraform. That is, first the new Security Group resource must be created, then associated to possible Network Interface resources and finally the old Security Group can be detached and deleted. To force this behavior, you must set the [create_before_destroy](https://www.terraform.io/language/meta-arguments/lifecycle#create_before_destroy) property:
Security groups are generally associated with other resources--**more than 100** AWS Provider resources reference security groups. Referencing a resource from another resource creates a one-way dependency. For example, if you create an EC2 `aws_instance` that has a `vpc_security_group_ids` argument that refers to an `aws_security_group` resource, the `aws_security_group` is a dependent of the `aws_instance`. Because of this, Terraform will create the security group first so that it can then be associated with the EC2 instance.

However, the dependency relationship actually goes both directions causing the _Security Group Deletion Problem_. AWS does not allow you to delete the security group associated with another resource (_e.g._, the `aws_instance`).

Terraform does [not model bi-directional dependencies](https://developer.hashicorp.com/terraform/internals/graph) like this, but, even if it did, simply knowing the dependency situation would not be enough to solve it. For example, some resources must always have an associated security group while others don't need to. In addition, when the `aws_security_group` resource attempts to recreate, it receives a dependent object error, which does not provide information on whether the dependent object is a security group rule or, for example, an associated EC2 instance. Within Terraform, the associated resource (_e.g._, `aws_instance`) does not receive an error when the `aws_security_group` is trying to recreate even though that is where changes to the associated resource would need to take place (_e.g._, removing the security group association).

Despite these sticky problems, below are some ways to improve your experience when you find it necessary to recreate a security group.

#### `create_before_destroy`

(This example is one approach to [recreating security groups](#recreating-a-security-group). For more information on the challenges and the _Security Group Deletion Problem_, see [the section above](#recreating-a-security-group).)

Normally, Terraform first deletes the existing security group resource and then creates a new one. When a security group is associated with a resource, the delete won't succeed. You can invert the default behavior using the [`create_before_destroy` meta argument](https://www.terraform.io/language/meta-arguments/lifecycle#create_before_destroy):

```terraform
resource "aws_security_group" "sg_with_changeable_name" {
resource "aws_security_group" "example" {
name = "changeable-name"
# ... other configuration ...
lifecycle {
# Necessary if changing 'name' or 'name_prefix' properties.
create_before_destroy = true
}
}
```

#### `replace_triggered_by`

(This example is one approach to [recreating security groups](#recreating-a-security-group). For more information on the challenges and the _Security Group Deletion Problem_, see [the section above](#recreating-a-security-group).)

To replace a resource when a security group changes, use the [`replace_triggered_by` meta argument](https://www.terraform.io/language/meta-arguments/lifecycle#replace_triggered_by). Note that in this example, the `aws_instance` will be destroyed and created again when the `aws_security_group` changes.

```terraform
resource "aws_security_group" "example" {
name = "sg"
# ... other configuration ...
}
resource "aws_instance" "example" {
instance_type = "t3.small"
# ... other configuration ...
vpc_security_group_ids = [aws_security_group.test.id]
lifecycle {
# Reference the security group as a whole or individual attributes like `name`
replace_triggered_by = [aws_security_group.example]
}
}
```

#### Shorter timeout

(This example is one approach to [recreating security groups](#recreating-a-security-group). For more information on the challenges and the _Security Group Deletion Problem_, see [the section above](#recreating-a-security-group).)

If destroying a security group takes a long time, it may be because Terraform cannot distinguish between a dependent object (_e.g._, a security group rule or EC2 instance) that is _in the process of being deleted_ and one that is not. In other words, it may be waiting for a train that isn't scheduled to arrive. To fail faster, shorten the `delete` [timeout](https://developer.hashicorp.com/terraform/language/resources/syntax#operation-timeouts) from the default timeout:

```terraform
resource "aws_security_group" "example" {
name = "izizavle"
# ... other configuration ...
timeouts {
delete = "2m"
}
}
```

#### Provisioners

(This example is one approach to [recreating security groups](#recreating-a-security-group). For more information on the challenges and the _Security Group Deletion Problem_, see [the section above](#recreating-a-security-group).)

**DISCLAIMER:** We **_HIGHLY_** recommend using one of the above approaches and _NOT_ using local provisioners. Provisioners, like the one shown below, should be considered a **last resort** since they are _not readable_, _require skills outside standard Terraform configuration_, are _error prone_ and _difficult to maintain_, are not compatible with cloud environments and upgrade tools, require AWS CLI installation, and are subject to AWS CLI and Terraform changes outside the AWS Provider.

```terraform
data "aws_security_group" "default" {
name = "default"
# ... other configuration ...
}
resource "aws_security_group" "example" {
name = "sg"
# ... other configuration ...
# The downstream resource must have at least one SG attached, therefore we
# attach the default SG of the VPC temporarily and remove it later on
provisioner "local-exec" {
when = destroy
command = <<DOC
ENDPOINT_ID=`aws ec2 describe-vpc-endpoints --filters "Name=tag:Name,Values=${self.tags.workaround1}" --query "VpcEndpoints[0].VpcEndpointId" --output text` &&
aws ec2 modify-vpc-endpoint --vpc-endpoint-id $${ENDPOINT_ID} --add-security-group-ids ${self.tags.workaround2} --remove-security-group-ids ${self.id}
DOC
on_failure = continue
}
tags = {
workaround1 = "tagged-name"
workaround2 = data.aws_security_group.default.id
}
}
resource "null_resource" "example" {
provisioner "local-exec" {
command = <<DOC
aws ec2 modify-vpc-endpoint --vpc-endpoint-id ${aws_vpc_endpoint.example.id} --remove-security-group-ids ${data.aws_security_group.default.id}
DOC
on_failure = continue
}
triggers = {
rerun_upon_change_of = join(",", aws_vpc_endpoint.example.security_group_ids)
}
}
```

## Argument Reference

The following arguments are supported:
Expand All @@ -125,8 +222,7 @@ The following arguments are supported:
* `name` - (Optional, Forces new resource) Name of the security group. If omitted, Terraform will assign a random, unique name.
* `revoke_rules_on_delete` - (Optional) Instruct Terraform to revoke all of the Security Groups attached ingress and egress rules before deleting the rule itself. This is normally not needed, however certain AWS services such as Elastic Map Reduce may automatically add required rules to security groups used with the service, and those rules may contain a cyclic dependency that prevent the security groups from being destroyed without removing the dependency first. Default `false`.
* `tags` - (Optional) Map of tags to assign to the resource. If configured with a provider [`default_tags` configuration block](https://registry.terraform.io/providers/hashicorp/aws/latest/docs#default_tags-configuration-block) present, tags with matching keys will overwrite those defined at the provider-level.
* `vpc_id` - (Optional, Forces new resource) VPC ID.
Defaults to the region's default VPC.
* `vpc_id` - (Optional, Forces new resource) VPC ID. Defaults to the region's default VPC.

### ingress

Expand Down

0 comments on commit ddde12c

Please sign in to comment.