Skip to content

Commit

Permalink
fix(cli): garbage collection ignores review_in_progress stacks (#31906)
Browse files Browse the repository at this point in the history
Calling this a feat because I believe technically we are updating the functionality of gc. 

Previously we were waiting for stacks in `REVIEW_IN_PROGRESS` to land, because that is the one CFN state that you cannot retrieve a template for (because it doesn't exist yet). However in environments where we are constantly deploying new stacks (like our test environments), we may never get to a state in the allotted time where no stacks are `REVIEW_IN_PROGRESS`.

Instead, we are going to ignore `REVIEW_IN_PROGRESS` stacks. This will introduce a subtle race condition where a previously isolated asset becomes in-use by the `REVIEW_IN_PROGRESS` stack before it turns into a `CREATE_IN_PROGRESS` stack and we can reference its stack. If garbage collection happens to come across the isolated asset while the stack is `REVIEW_IN_PROGRESS` (aka before it is `CREATE_IN_PROGRESS` but after CDK has verified that the assets exist) we will garbage collect the asset. However, we don't expect this to become a big problem in practice.

----

*By submitting this pull request, I confirm that my contribution is made under the terms of the Apache-2.0 license*
  • Loading branch information
kaizencc authored Oct 25, 2024
1 parent e0615fe commit cb3ecfe
Show file tree
Hide file tree
Showing 3 changed files with 7 additions and 136 deletions.
12 changes: 1 addition & 11 deletions packages/aws-cdk/lib/api/garbage-collection/garbage-collector.ts
Original file line number Diff line number Diff line change
Expand Up @@ -110,13 +110,6 @@ interface GarbageCollectorProps {
*/
readonly bootstrapStackName?: string;

/**
* Max wait time for retries in milliseconds (for testing purposes).
*
* @default 60000
*/
readonly maxWaitTime?: number;

/**
* Confirm with the user before actual deletion happens
*
Expand All @@ -134,7 +127,6 @@ export class GarbageCollector {
private permissionToDelete: boolean;
private permissionToTag: boolean;
private bootstrapStackName: string;
private maxWaitTime: number;
private confirm: boolean;

public constructor(readonly props: GarbageCollectorProps) {
Expand All @@ -145,7 +137,6 @@ export class GarbageCollector {

this.permissionToDelete = ['delete-tagged', 'full'].includes(props.action);
this.permissionToTag = ['tag', 'full'].includes(props.action);
this.maxWaitTime = props.maxWaitTime ?? 60000;
this.confirm = props.confirm ?? true;

this.bootstrapStackName = props.bootstrapStackName ?? DEFAULT_TOOLKIT_STACK_NAME;
Expand Down Expand Up @@ -181,13 +172,12 @@ export class GarbageCollector {
const activeAssets = new ActiveAssetCache();

// Grab stack templates first
await refreshStacks(cfn, activeAssets, this.maxWaitTime, qualifier);
await refreshStacks(cfn, activeAssets, qualifier);
// Start the background refresh
const backgroundStackRefresh = new BackgroundStackRefresh({
cfn,
activeAssets,
qualifier,
maxWaitTime: this.maxWaitTime,
});
backgroundStackRefresh.start();

Expand Down
43 changes: 6 additions & 37 deletions packages/aws-cdk/lib/api/garbage-collection/stack-refresh.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import { CloudFormation } from 'aws-sdk';
import { sleep } from '../../../test/util';
import { debug } from '../../logging';

export class ActiveAssetCache {
Expand Down Expand Up @@ -30,41 +29,18 @@ async function paginateSdkCall(cb: (nextToken?: string) => Promise<string | unde
}
}

/** We cannot operate on REVIEW_IN_PROGRESS stacks because we do not know what the template looks like in this case
* If we encounter this status, we will wait up to the maxWaitTime before erroring out
*/
async function listStacksNotBeingReviewed(cfn: CloudFormation, maxWaitTime: number, nextToken: string | undefined) {
let sleepMs = 500;
const deadline = Date.now() + maxWaitTime;

while (Date.now() <= deadline) {
let stacks = await cfn.listStacks({ NextToken: nextToken }).promise();
if (!stacks.StackSummaries?.some(s => s.StackStatus == 'REVIEW_IN_PROGRESS')) {
return stacks;
}
await sleep(Math.floor(Math.random() * sleepMs));
sleepMs = sleepMs * 2;
}

throw new Error(`Stacks still in REVIEW_IN_PROGRESS state after waiting for ${maxWaitTime} ms.`);
}

/**
* Fetches all relevant stack templates from CloudFormation. It ignores the following stacks:
* - stacks in DELETE_COMPLETE or DELETE_IN_PROGRESS stage
* - stacks that are using a different bootstrap qualifier
*
* It fails on the following stacks because we cannot get the template and therefore have an imcomplete
* understanding of what assets are being used.
* - stacks in REVIEW_IN_PROGRESS stage
*/
async function fetchAllStackTemplates(cfn: CloudFormation, maxWaitTime: number, qualifier?: string) {
async function fetchAllStackTemplates(cfn: CloudFormation, qualifier?: string) {
const stackNames: string[] = [];
await paginateSdkCall(async (nextToken) => {
const stacks = await listStacksNotBeingReviewed(cfn, maxWaitTime, nextToken);
const stacks = await cfn.listStacks({ NextToken: nextToken }).promise();

// We ignore stacks with these statuses because their assets are no longer live
const ignoredStatues = ['CREATE_FAILED', 'DELETE_COMPLETE', 'DELETE_IN_PROGRESS', 'DELETE_FAILED'];
const ignoredStatues = ['CREATE_FAILED', 'DELETE_COMPLETE', 'DELETE_IN_PROGRESS', 'DELETE_FAILED', 'REVIEW_IN_PROGRESS'];
stackNames.push(
...(stacks.StackSummaries ?? [])
.filter(s => !ignoredStatues.includes(s.StackStatus))
Expand Down Expand Up @@ -119,9 +95,9 @@ function bootstrapFilter(parameters?: CloudFormation.ParameterDeclarations, qual
splitBootstrapVersion[2] != qualifier);
}

export async function refreshStacks(cfn: CloudFormation, activeAssets: ActiveAssetCache, maxWaitTime: number, qualifier?: string) {
export async function refreshStacks(cfn: CloudFormation, activeAssets: ActiveAssetCache, qualifier?: string) {
try {
const stacks = await fetchAllStackTemplates(cfn, maxWaitTime, qualifier);
const stacks = await fetchAllStackTemplates(cfn, qualifier);
for (const stack of stacks) {
activeAssets.rememberStack(stack);
}
Expand All @@ -148,13 +124,6 @@ export interface BackgroundStackRefreshProps {
* Stack bootstrap qualifier
*/
readonly qualifier?: string;

/**
* Maximum wait time when waiting for stacks to leave REVIEW_IN_PROGRESS stage.
*
* @default 60000
*/
readonly maxWaitTime?: number;
}

/**
Expand All @@ -178,7 +147,7 @@ export class BackgroundStackRefresh {
private async refresh() {
const startTime = Date.now();

await refreshStacks(this.props.cfn, this.props.activeAssets, this.props.maxWaitTime ?? 60000, this.props.qualifier);
await refreshStacks(this.props.cfn, this.props.activeAssets, this.props.qualifier);
this.justRefreshedStacks();

// If the last invocation of refreshStacks takes <5 minutes, the next invocation starts 5 minutes after the last one started.
Expand Down
88 changes: 0 additions & 88 deletions packages/aws-cdk/test/api/garbage-collection.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,6 @@ function gc(props: {
rollbackBufferDays?: number;
createdAtBufferDays?: number;
action: 'full' | 'print' | 'tag' | 'delete-tagged';
maxWaitTime?: number;
}): GarbageCollector {
return new GarbageCollector({
sdkProvider: sdk,
Expand All @@ -47,7 +46,6 @@ function gc(props: {
rollbackBufferDays: props.rollbackBufferDays ?? 0,
createdBufferDays: props.createdAtBufferDays ?? 0,
type: props.type,
maxWaitTime: props.maxWaitTime,
confirm: false,
});
}
Expand Down Expand Up @@ -435,91 +433,6 @@ describe('Garbage Collection', () => {
},
});
});

test('stackStatus in REVIEW_IN_PROGRESS means we wait until it changes', async () => {
mockTheToolkitInfo({
Outputs: [
{
OutputKey: 'BootstrapVersion',
OutputValue: '999',
},
],
});

// Mock the listStacks call
const mockListStacksStatus = jest.fn()
.mockResolvedValueOnce({
StackSummaries: [
{ StackName: 'Stack1', StackStatus: 'REVIEW_IN_PROGRESS' },
{ StackName: 'Stack2', StackStatus: 'UPDATE_COMPLETE' },
],
})
.mockResolvedValueOnce({
StackSummaries: [
{ StackName: 'Stack1', StackStatus: 'UPDATE_COMPLETE' },
{ StackName: 'Stack2', StackStatus: 'UPDATE_COMPLETE' },
],
});

sdk.stubCloudFormation({
listStacks: mockListStacksStatus,
getTemplateSummary: mockGetTemplateSummary,
getTemplate: mockGetTemplate,
});

garbageCollector = garbageCollector = gc({
type: 's3',
rollbackBufferDays: 3,
action: 'full',
});
await garbageCollector.garbageCollect();

// list are called as expected
expect(mockListStacksStatus).toHaveBeenCalledTimes(2);

// everything else runs as expected:
// assets tagged
expect(mockGetObjectTagging).toHaveBeenCalledTimes(3);
expect(mockPutObjectTagging).toHaveBeenCalledTimes(2); // one object already has the tag

// no deleting
expect(mockDeleteObjects).toHaveBeenCalledTimes(0);
}, 60000);

test('fails when stackStatus stuck in REVIEW_IN_PROGRESS', async () => {
mockTheToolkitInfo({
Outputs: [
{
OutputKey: 'BootstrapVersion',
OutputValue: '999',
},
],
});

// Mock the listStacks call
const mockListStacksStatus = jest.fn()
.mockResolvedValue({
StackSummaries: [
{ StackName: 'Stack1', StackStatus: 'REVIEW_IN_PROGRESS' },
{ StackName: 'Stack2', StackStatus: 'UPDATE_COMPLETE' },
],
});

sdk.stubCloudFormation({
listStacks: mockListStacksStatus,
getTemplateSummary: mockGetTemplateSummary,
getTemplate: mockGetTemplate,
});

garbageCollector = garbageCollector = gc({
type: 's3',
rollbackBufferDays: 3,
action: 'full',
maxWaitTime: 600, // Wait only 600 ms in tests
});

await expect(garbageCollector.garbageCollect()).rejects.toThrow(/Stacks still in REVIEW_IN_PROGRESS state after waiting/);
}, 60000);
});

let mockListObjectsV2Large: (params: AWS.S3.Types.ListObjectsV2Request) => AWS.S3.Types.ListObjectsV2Output;
Expand Down Expand Up @@ -680,7 +593,6 @@ describe('BackgroundStackRefresh', () => {
refreshProps = {
cfn: sdk.mockSdk.cloudFormation(),
activeAssets: new ActiveAssetCache(),
maxWaitTime: 60000, // 1 minute
};

backgroundRefresh = new BackgroundStackRefresh(refreshProps);
Expand Down

0 comments on commit cb3ecfe

Please sign in to comment.