ROX-13692: create final snapshot for tenant db #994
Changes from all commits
ea75421
d4b48b7
5c5d39d
2293d52
3d26f7d
cda9636
1a073da
a411a4b
14a3d36
c016838
7f218ad
bdcf920
```diff
@@ -17,7 +17,7 @@ import (
 	"github.com/stretchr/testify/require"
 )

-const awsTimeoutMinutes = 15
+const awsTimeoutMinutes = 30

 func newTestRDS() (*RDS, error) {
 	rdsClient, err := newTestRDSClient()
@@ -47,7 +47,7 @@ func newTestRDSClient() (*rds.RDS, error) {

 func waitForClusterToBeDeleted(ctx context.Context, rdsClient *RDS, clusterID string) (bool, error) {
 	for {
-		clusterExists, clusterStatus, err := rdsClient.clusterStatus(clusterID)
+		clusterExists, _, err := rdsClient.clusterStatus(clusterID)
 		if err != nil {
 			return false, err
 		}
@@ -56,11 +56,6 @@ func waitForClusterToBeDeleted(ctx context.Context, rdsClient *RDS, clusterID st
 			return true, nil
 		}

-		// exit early if cluster is marked as deleting
```
Review thread on the removed early exit:

Reviewer: This code was added to avoid CI flakes, see:
Deleting a DB occasionally takes a very long time, and it doesn't bring much benefit to wait (we're basically testing AWS, not our code). But if this is really needed to check the snapshots, we'll have to increase the timeouts for this test.

Author: The snapshot is not created until a few seconds after the cluster is no longer listed in the RDS API, so unfortunately this is necessary to check that snapshots are created.

Author: Which timeout setting are you referring to?

Reviewer: There are several, and we need to increase them all (I'd suggest doubling the values):
Anyway, if we do this, we'll have another long-running (and possibly still flaky) test in addition to the E2E one.

Author: I agree that this is not ideal, but I don't see a way to improve it right now. I think it is crucial to make sure the final snapshot is created when DBs are deleted. Another solution could be to test this in the regular e2e suite instead of the RDS suite, if you're more comfortable with that.

Author: Incremented all timeouts by 15 minutes. During my tests in this PR I noticed no failures because of timeouts. Nevertheless, I increased them by 15 minutes because at some point the timeout came very close to actually triggering.

Reviewer: The regular e2e suite doesn't test RDS integration at all, so we can't move this there. I didn't know that automatic snapshots are deleted when the DB is deleted; that doesn't sound too good, and I'd follow up with AWS on it. Normally these tests take around 20 minutes, but there are times when they take much longer. I've already had to adjust them twice because of that. It's OK for now, and if they fail in the future we'll increase the timeouts again.
```diff
-		if clusterStatus == dbDeletingStatus {
-			return true, nil
-		}
-
 		ticker := time.NewTicker(awsRetrySeconds * time.Second)
 		select {
 		case <-ticker.C:
@@ -71,6 +66,37 @@ func waitForClusterToBeDeleted(ctx context.Context, rdsClient *RDS, clusterID st
 	}
 }
```
```diff
+func waitForFinalSnapshotToExist(ctx context.Context, rdsClient *RDS, clusterID string) (bool, error) {
+	ticker := time.NewTicker(awsRetrySeconds * time.Second)
+	for {
+		select {
+		case <-ticker.C:
+			snapshotOut, err := rdsClient.rdsClient.DescribeDBClusterSnapshots(&rds.DescribeDBClusterSnapshotsInput{
+				DBClusterSnapshotIdentifier: getFinalSnapshotID(clusterID),
+			})
+
+			if err != nil {
+				if awsErr, ok := err.(awserr.Error); ok {
+					if awsErr.Code() != rds.ErrCodeDBClusterSnapshotNotFoundFault {
+						return false, err
+					}
+
+					continue
+				}
+			}
+
+			if snapshotOut != nil {
+				return len(snapshotOut.DBClusterSnapshots) == 1, nil
+			}
+		case <-ctx.Done():
+			return false, fmt.Errorf("waiting for final DB snapshot: %w", ctx.Err())
+		}
+	}
+}
+
 func TestRDSProvisioning(t *testing.T) {
 	if os.Getenv("RUN_RDS_TESTS") != "true" {
 		t.Skip("Skip RDS tests. Set RUN_RDS_TESTS=true env variable to enable RDS tests.")
```
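The new helper calls getFinalSnapshotID, which this excerpt doesn't show. A plausible sketch, hypothetical since the real helper is defined elsewhere in the repository; the diff only shows that it must return a *string suitable for the AWS SDK's pointer-typed inputs:

```go
package provisioning // hypothetical package name

import (
	"fmt"

	"github.com/aws/aws-sdk-go/aws"
)

// getFinalSnapshotID derives a deterministic final-snapshot identifier from
// the cluster ID. The "-final" naming scheme is an assumption.
func getFinalSnapshotID(clusterID string) *string {
	return aws.String(fmt.Sprintf("%s-final", clusterID))
}
```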
```diff
@@ -136,6 +162,19 @@ func TestRDSProvisioning(t *testing.T) {
 	clusterDeleted, err := waitForClusterToBeDeleted(deleteCtx, rdsClient, clusterID)
 	require.NoError(t, err)
 	assert.True(t, clusterDeleted)

+	// Always attempt to delete the final snapshot if it exists
+	defer func() {
+		_, err := rdsClient.rdsClient.DeleteDBClusterSnapshot(
+			&rds.DeleteDBClusterSnapshotInput{DBClusterSnapshotIdentifier: getFinalSnapshotID(clusterID)},
+		)
+
+		assert.NoError(t, err)
+	}()
+
+	snapshotExists, err := waitForFinalSnapshotToExist(deleteCtx, rdsClient, clusterID)
+	require.NoError(t, err)
+	require.True(t, snapshotExists)
 }

 func TestGetDBConnection(t *testing.T) {
```
Review comment on the diff:

Reviewer: Nit: I think we can just remove this skipFinalSnapshot parameter completely.
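For reference, with aws-sdk-go v1 the final snapshot is requested on the DeleteDBCluster call itself, which is also where a skipFinalSnapshot parameter would collapse into a hard-coded value. A sketch under that assumption — SkipFinalSnapshot and FinalDBSnapshotIdentifier are real DeleteDBClusterInput fields, but deleteClusterWithFinalSnapshot and the surrounding wiring are hypothetical:

```go
package provisioning // hypothetical package name

import (
	"github.com/aws/aws-sdk-go/aws"
	"github.com/aws/aws-sdk-go/service/rds"
)

// deleteClusterWithFinalSnapshot always requests a final snapshot, which is
// what dropping the skipFinalSnapshot parameter would imply.
func deleteClusterWithFinalSnapshot(client *rds.RDS, clusterID string) error {
	_, err := client.DeleteDBCluster(&rds.DeleteDBClusterInput{
		DBClusterIdentifier:       aws.String(clusterID),
		SkipFinalSnapshot:         aws.Bool(false),
		FinalDBSnapshotIdentifier: aws.String(clusterID + "-final"), // assumed naming, see getFinalSnapshotID sketch above
	})
	return err
}
```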