ROX-13692: create final snapshot for tenant db #994

Merged
12 commits merged on May 4, 2023
.github/workflows/rds.yaml (3 changes: 2 additions & 1 deletion)
@@ -32,6 +32,7 @@ on:
- 'docs/**'
- 'pkg/api/openapi/docs/**'
- 'pkg/api/openapi/.openapi-generator-ignore'
+      - 'dp-terraform/**'

jobs:
verify-test:
@@ -67,4 +68,4 @@ jobs:
AWS_AUTH_HELPER: "none"
run: |
./dev/env/scripts/exec_fleetshard_sync.sh make test/rds
-      timeout-minutes: 35
+      timeout-minutes: 50
Makefile (2 changes: 1 addition & 1 deletion)
@@ -326,7 +326,7 @@ test: $(GOTESTSUM_BIN)
# Runs the AWS RDS integration tests.
test/rds: $(GOTESTSUM_BIN)
RUN_RDS_TESTS=true \
-	$(GOTESTSUM_BIN) --junitfile data/results/rds-integration-tests.xml --format $(GOTESTSUM_FORMAT) -- -p 1 -v -timeout 30m -count=1 \
+	$(GOTESTSUM_BIN) --junitfile data/results/rds-integration-tests.xml --format $(GOTESTSUM_FORMAT) -- -p 1 -v -timeout 45m -count=1 \
./fleetshard/pkg/central/cloudprovider/awsclient/...
.PHONY: test/rds

fleetshard/pkg/central/cloudprovider/awsclient/rds.go (16 changes: 12 additions & 4 deletions)
@@ -166,7 +166,6 @@ func (r *RDS) ensureInstanceDeleted(instanceID string) error {

if instanceStatus != dbDeletingStatus {
glog.Infof("Initiating deprovisioning of RDS database instance %s.", instanceID)
-		// TODO(ROX-13692): do not skip taking a final DB snapshot
_, err := r.rdsClient.DeleteDBInstance(newDeleteCentralDBInstanceInput(instanceID, true))
if err != nil {
return fmt.Errorf("deleting DB instance: %w", err)
@@ -187,8 +186,7 @@ func (r *RDS) ensureClusterDeleted(clusterID string) error {

if clusterStatus != dbDeletingStatus {
glog.Infof("Initiating deprovisioning of RDS database cluster %s.", clusterID)
-		// TODO(ROX-13692): do not skip taking a final DB snapshot
-		_, err := r.rdsClient.DeleteDBCluster(newDeleteCentralDBClusterInput(clusterID, true))
+		_, err := r.rdsClient.DeleteDBCluster(newDeleteCentralDBClusterInput(clusterID, false))
if err != nil {
return fmt.Errorf("deleting DB cluster: %w", err)
}
@@ -366,10 +364,16 @@ func newDeleteCentralDBInstanceInput(instanceID string, skipFinalSnapshot bool) *rds.DeleteDBInstanceInput {
}

func newDeleteCentralDBClusterInput(clusterID string, skipFinalSnapshot bool) *rds.DeleteDBClusterInput {
-	return &rds.DeleteDBClusterInput{
+	input := &rds.DeleteDBClusterInput{
DBClusterIdentifier: aws.String(clusterID),
SkipFinalSnapshot: aws.Bool(skipFinalSnapshot),
}

+	if !skipFinalSnapshot {

Contributor: Nit: I think we can just remove this skipFinalSnapshot parameter completely.

+		input.FinalDBSnapshotIdentifier = getFinalSnapshotID(clusterID)
+	}

+	return input
}
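
If the reviewer's nit were applied, the helper could collapse into a single literal. A minimal sketch, assuming the parameter is also dropped at every call site (hypothetical; the merged PR keeps the parameter, and the function name here is illustrative):

// Hypothetical variant without the skipFinalSnapshot parameter: cluster
// deletion always requests a final snapshot.
func newDeleteCentralDBClusterInputWithSnapshot(clusterID string) *rds.DeleteDBClusterInput {
	return &rds.DeleteDBClusterInput{
		DBClusterIdentifier:       aws.String(clusterID),
		SkipFinalSnapshot:         aws.Bool(false),
		FinalDBSnapshotIdentifier: getFinalSnapshotID(clusterID),
	}
}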

func newRdsClient(awsConfig config.AWS, auth fleetmanager.Auth) (*rds.RDS, error) {
@@ -395,6 +399,10 @@ func newRdsClient(awsConfig config.AWS, auth fleetmanager.Auth) (*rds.RDS, error) {
return rds.New(sess), nil
}

+func getFinalSnapshotID(clusterID string) *string {
+	return aws.String(fmt.Sprintf("%s-%s", clusterID, "final"))
+}
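
For context on how this snapshot ID is meant to be used: a final snapshot taken on deletion can later be restored into a fresh cluster. A minimal sketch against the same aws-sdk-go v1 RDS API; restoreFromFinalSnapshot, restoredID, and the engine value are illustrative assumptions, not part of this PR:

// Hypothetical restore flow: bring the final snapshot of a deleted
// tenant DB back up as a new cluster.
func restoreFromFinalSnapshot(client *rds.RDS, clusterID, restoredID string) error {
	_, err := client.RestoreDBClusterFromSnapshot(&rds.RestoreDBClusterFromSnapshotInput{
		DBClusterIdentifier: aws.String(restoredID),
		SnapshotIdentifier:  getFinalSnapshotID(clusterID),
		Engine:              aws.String("aurora-postgresql"), // assumed engine
	})
	if err != nil {
		return fmt.Errorf("restoring DB cluster from final snapshot: %w", err)
	}
	return nil
}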

type tokenFetcher struct {
auth fleetmanager.Auth
}
fleetshard/pkg/central/cloudprovider/awsclient/rds_test.go (53 changes: 46 additions & 7 deletions)
@@ -17,7 +17,7 @@ import (
"github.com/stretchr/testify/require"
)

-const awsTimeoutMinutes = 15
+const awsTimeoutMinutes = 30

func newTestRDS() (*RDS, error) {
rdsClient, err := newTestRDSClient()
@@ -47,7 +47,7 @@ func newTestRDSClient() (*rds.RDS, error) {

func waitForClusterToBeDeleted(ctx context.Context, rdsClient *RDS, clusterID string) (bool, error) {
for {
-		clusterExists, clusterStatus, err := rdsClient.clusterStatus(clusterID)
+		clusterExists, _, err := rdsClient.clusterStatus(clusterID)
if err != nil {
return false, err
}
@@ -56,11 +56,6 @@ func waitForClusterToBeDeleted(ctx context.Context, rdsClient *RDS, clusterID string) (bool, error) {
return true, nil
}

-		// exit early if cluster is marked as deleting
-		if clusterStatus == dbDeletingStatus {
-			return true, nil
-		}

Contributor: This code was added to avoid CI flakes, see #838. Deleting a DB occasionally takes a very long time, and it doesn't bring much benefit to wait (we're basically testing AWS, not our code). But if this is really needed to check the snapshots, we'll have to increase the timeouts for this test.

Contributor Author: The snapshot is not created until a few seconds after the cluster is no longer listed in the RDS API. So unfortunately this is necessary to check that snapshots are created.

Contributor Author: Which timeout setting are you referring to?

Contributor: There are several, and we need to increase them all (I'd suggest doubling the values):

  • awsTimeoutMinutes = 15 in rds_test.go:20
  • -timeout 30m in Makefile:329
  • timeout-minutes: 35 in rds.yaml:70

Anyway, if we do this, we'll have another long-running (and possibly still flaky) test in addition to the E2E one.

Contributor Author: I agree that this is not ideal, but I don't see a way to improve it right now. I think it is crucial to make sure the final snapshot is created when DBs are deleted. Another solution could be to test this in the regular e2e suite instead of the RDS suite, if you're more comfortable with that.

Contributor Author: Incremented all timeouts by 15 minutes. During my tests in this PR I noticed no failures because of timeouts; nevertheless, I increased them because at one point the timeout came very close to actually triggering.

Contributor: The regular e2e suite doesn't test RDS integration at all, so we can't move this there. I didn't know that automatic snapshots are deleted when the DB is deleted; that doesn't sound too good, and I'd follow up with AWS on it. Normally these tests take around 20 minutes, but there are times when they take much longer; I've already had to adjust the timeouts twice because of that. It's OK for now, and if they fail in the future we'll increase them again.

ticker := time.NewTicker(awsRetrySeconds * time.Second)
select {
case <-ticker.C:
@@ -71,6 +66,37 @@ func waitForClusterToBeDeleted(ctx context.Context, rdsClient *RDS, clusterID string) (bool, error) {
}
}

+func waitForFinalSnapshotToExist(ctx context.Context, rdsClient *RDS, clusterID string) (bool, error) {
+	ticker := time.NewTicker(awsRetrySeconds * time.Second)
+	for {
+		select {
+		case <-ticker.C:
+			snapshotOut, err := rdsClient.rdsClient.DescribeDBClusterSnapshots(&rds.DescribeDBClusterSnapshotsInput{
+				DBClusterSnapshotIdentifier: getFinalSnapshotID(clusterID),
+			})
+			if err != nil {
+				awsErr, ok := err.(awserr.Error)
+				if !ok || awsErr.Code() != rds.ErrCodeDBClusterSnapshotNotFoundFault {
+					return false, err
+				}
+				// Snapshot does not exist yet: keep polling.
+				continue
+			}
+			return len(snapshotOut.DBClusterSnapshots) == 1, nil
+		case <-ctx.Done():
+			return false, fmt.Errorf("waiting for final DB snapshot: %w", ctx.Err())
+		}
+	}
+}
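
A possible simplification: aws-sdk-go v1 ships snapshot waiters, so the hand-rolled polling loop could likely be replaced, assuming the vendored SDK version includes them. Note the semantics differ slightly (the waiter waits for the snapshot to become available, not merely to exist), and the function name below is illustrative:

// Sketch of a waiter-based alternative, assuming
// WaitUntilDBClusterSnapshotAvailableWithContext exists in this SDK version.
func waitForFinalSnapshotAvailable(ctx context.Context, rdsClient *RDS, clusterID string) error {
	return rdsClient.rdsClient.WaitUntilDBClusterSnapshotAvailableWithContext(ctx,
		&rds.DescribeDBClusterSnapshotsInput{
			DBClusterSnapshotIdentifier: getFinalSnapshotID(clusterID),
		})
}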

func TestRDSProvisioning(t *testing.T) {
if os.Getenv("RUN_RDS_TESTS") != "true" {
t.Skip("Skip RDS tests. Set RUN_RDS_TESTS=true env variable to enable RDS tests.")
@@ -136,6 +162,19 @@ func TestRDSProvisioning(t *testing.T) {
clusterDeleted, err := waitForClusterToBeDeleted(deleteCtx, rdsClient, clusterID)
require.NoError(t, err)
assert.True(t, clusterDeleted)

+	// Always attempt to delete the final snapshot if it exists.
+	defer func() {
+		_, err := rdsClient.rdsClient.DeleteDBClusterSnapshot(
+			&rds.DeleteDBClusterSnapshotInput{DBClusterSnapshotIdentifier: getFinalSnapshotID(clusterID)},
+		)
+		assert.NoError(t, err)
+	}()

+	snapshotExists, err := waitForFinalSnapshotToExist(deleteCtx, rdsClient, clusterID)
+	require.NoError(t, err)
+	require.True(t, snapshotExists)
}

func TestGetDBConnection(t *testing.T) {