test(crash): Crash minor fixes (aws#5804)
See each commit message for the relevant fix.


---


License: I confirm that my contribution is made under the terms of the
Apache 2.0 license.

---------

Signed-off-by: nkomonen-amazon <[email protected]>
nkomonen-amazon authored Oct 17, 2024
1 parent b958880 commit 192d1ea
Showing 2 changed files with 167 additions and 63 deletions.
45 changes: 35 additions & 10 deletions packages/core/src/shared/crashMonitoring.ts
@@ -69,7 +69,8 @@ export class CrashMonitoring {
private readonly checkInterval: number,
private readonly isDevMode: boolean,
private readonly isAutomation: boolean,
private readonly devLogger: Logger | undefined
private readonly devLogger: Logger | undefined,
private readonly timeLag: TimeLag
) {}

static #didTryCreate = false
@@ -92,7 +93,8 @@ export class CrashMonitoring {
DevSettings.instance.get('crashCheckInterval', 1000 * 60 * 10), // check every 10 minutes
isDevMode,
isAutomation(),
devModeLogger
devModeLogger,
new TimeLag()
))
} catch (error) {
emitFailure({ functionName: 'instance', error })
@@ -115,7 +117,13 @@ export class CrashMonitoring {
this.heartbeat = new Heartbeat(this.state, this.checkInterval, this.isDevMode)
this.heartbeat.onFailure(() => this.cleanup())

this.crashChecker = new CrashChecker(this.state, this.checkInterval, this.isDevMode, this.devLogger)
this.crashChecker = new CrashChecker(
this.state,
this.checkInterval,
this.isDevMode,
this.devLogger,
this.timeLag
)
this.crashChecker.onFailure(() => this.cleanup())

await this.heartbeat.start()
@@ -236,7 +244,7 @@ class CrashChecker {
* Solution: Keep track of the lag, and then skip the next crash check if there was a lag. This will give time for the
* next heartbeat to be sent.
*/
private readonly timeLag: TimeLag = new TimeLag()
private readonly timeLag: TimeLag
) {}

public async start() {
@@ -391,7 +399,7 @@ export async function crashMonitoringStateFactory(deps = getDefaultDependencies(
* - is not truly reliable since filesystems are not reliable
*/
export class FileSystemState {
private readonly stateDirPath: string
public readonly stateDirPath: string

/**
* IMPORTANT: Use {@link crashMonitoringStateFactory} to make an instance
@@ -512,9 +520,10 @@ export class FileSystemState {
protected createExtId(ext: ExtInstance): ExtInstanceId {
return `${ext.extHostPid}_${ext.sessionId}`
}
private readonly fileSuffix = 'running'
private makeStateFilePath(ext: ExtInstance | ExtInstanceId) {
const extId = typeof ext === 'string' ? ext : this.createExtId(ext)
return path.join(this.stateDirPath, extId)
return path.join(this.stateDirPath, extId + `.${this.fileSuffix}`)
}
public async clearState(): Promise<void> {
this.deps.devLogger?.debug('crashMonitoring: CLEAR_STATE: Started')
@@ -525,10 +534,26 @@ export class FileSystemState {
}
public async getAllExts(): Promise<ExtInstanceHeartbeat[]> {
const res = await withFailCtx('getAllExts', async () => {
// The file names are intentionally the IDs for easy mapping
const allExtIds: ExtInstanceId[] = await withFailCtx('readdir', async () =>
(await fs.readdir(this.stateDirPath)).map((k) => k[0])
)
// Read all the exts from the filesystem, deserializing as needed
const allExtIds: ExtInstanceId[] = await withFailCtx('readdir', async () => {
const filesInDir = await fs.readdir(this.stateDirPath)
const relevantFiles = filesInDir.filter((file: [string, vscode.FileType]) => {
const name = file[0]
const type = file[1]
if (type !== vscode.FileType.File) {
return false
}
if (path.extname(name) !== `.${this.fileSuffix}`) {
return false
}
return true
})
const idsFromFileNames = relevantFiles.map((file: [string, vscode.FileType]) => {
const name = file[0]
return name.split('.')[0]
})
return idsFromFileNames
})

const allExts = allExtIds.map<Promise<ExtInstanceHeartbeat | undefined>>(async (extId: string) => {
// Due to a race condition, a separate extension instance may have removed this file by this point. It is okay since
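For context on the new `timeLag` dependency threaded through `CrashMonitoring` and `CrashChecker` above: this diff only confirms the `start()`/`didLag()`/`cleanup()` surface of `TimeLag` (see the test stubs in the second file), not its internals in `timeoutUtils.ts`. A minimal sketch under those assumptions — the tick interval, the 2× lag threshold, and the `checkOnce` helper are all illustrative, not the actual implementation:

```ts
// Sketch only: the real TimeLag lives in packages/core/src/shared/utilities/timeoutUtils.ts.
// Only start()/didLag()/cleanup() are confirmed by this diff; everything else is assumed.
class TimeLagSketch {
    private lastTick = Date.now()
    private timer: ReturnType<typeof setInterval> | undefined

    public constructor(private readonly tickMs: number = 1_000) {}

    /** Record a timestamp on every tick while the extension host is awake. */
    public async start(): Promise<void> {
        this.lastTick = Date.now()
        this.timer = setInterval(() => {
            this.lastTick = Date.now()
        }, this.tickMs)
    }

    /**
     * Report a lag when far more wall-clock time passed since the last tick
     * than expected (e.g. the machine slept). The 2x threshold is an assumption.
     */
    public async didLag(): Promise<boolean> {
        return Date.now() - this.lastTick > this.tickMs * 2
    }

    public cleanup(): void {
        if (this.timer !== undefined) {
            clearInterval(this.timer)
        }
    }
}

// Per the CrashChecker doc comment above: skip the crash check after a lag so
// still-alive extensions get one more window to send a heartbeat.
async function checkOnce(timeLag: TimeLagSketch, checkForCrashes: () => Promise<void>): Promise<void> {
    if (await timeLag.didLag()) {
        return // e.g. sleep+wake: a missing heartbeat here is not evidence of a crash
    }
    await checkForCrashes()
}
```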
185 changes: 132 additions & 53 deletions packages/core/src/test/shared/crashMonitoring.test.ts
@@ -9,6 +9,10 @@ import globals from '../../shared/extensionGlobals'
import { CrashMonitoring, ExtInstance, crashMonitoringStateFactory } from '../../shared/crashMonitoring'
import { isCI } from '../../shared/vscode/env'
import { getLogger } from '../../shared/logger/logger'
import { TimeLag } from '../../shared/utilities/timeoutUtils'
import { SinonSandbox, createSandbox } from 'sinon'
import { fs } from '../../shared'
import path from 'path'

class TestCrashMonitoring extends CrashMonitoring {
public constructor(...deps: ConstructorParameters<typeof CrashMonitoring>) {
@@ -24,6 +28,8 @@ class TestCrashMonitoring extends CrashMonitoring {
export const crashMonitoringTest = async () => {
let testFolder: TestFolder
let spawnedExtensions: TestCrashMonitoring[]
let timeLag: TimeLag
let sandbox: SinonSandbox

// Scale down the default intervals at which we heartbeat and check for crashes to something much shorter for testing.
const checkInterval = 200
@@ -38,48 +44,54 @@ export const crashMonitoringTest = async () => {
* 1:1 mapping between Crash Reporting instances and the Extension instances.
*/
async function makeTestExtensions(amount: number) {
const devLogger = getLogger()

const extensions: TestExtension[] = []
for (let i = 0; i < amount; i++) {
const sessionId = `sessionId-${i}`
const pid = Number(String(i).repeat(6))
const state = await crashMonitoringStateFactory({
workDirPath: testFolder.path,
isStateStale: async () => false,
pid,
sessionId: sessionId,
now: () => globals.clock.Date.now(),
memento: globals.globalState,
isDevMode: true,
devLogger,
})
const ext = new TestCrashMonitoring(state, checkInterval, true, false, devLogger)
spawnedExtensions.push(ext)
const metadata = {
extHostPid: pid,
sessionId,
lastHeartbeat: globals.clock.Date.now(),
isDebug: undefined,
}
extensions[i] = { ext, metadata }
extensions[i] = await makeTestExtension(i, new TimeLag())
}
return extensions
}

async function makeTestExtension(id: number, timeLag: TimeLag, opts?: { isStateStale: () => Promise<boolean> }) {
const isStateStale = opts?.isStateStale ?? (() => Promise.resolve(false))
const sessionId = `sessionId-${id}`
const pid = Number(String(id).repeat(6))

const state = await crashMonitoringStateFactory({
workDirPath: testFolder.path,
isStateStale,
pid,
sessionId: sessionId,
now: () => globals.clock.Date.now(),
memento: globals.globalState,
isDevMode: true,
devLogger: getLogger(),
})
const ext = new TestCrashMonitoring(state, checkInterval, true, false, getLogger(), timeLag)
spawnedExtensions.push(ext)
const metadata = {
extHostPid: pid,
sessionId,
lastHeartbeat: globals.clock.Date.now(),
isDebug: undefined,
}
return { ext, metadata }
}

beforeEach(async function () {
testFolder = await TestFolder.create()
spawnedExtensions = []
timeLag = new TimeLag()
sandbox = createSandbox()
})

afterEach(async function () {
// clean up all running instances
spawnedExtensions?.forEach((e) => e.crash())
timeLag.cleanup()
sandbox.restore()
})

it('graceful shutdown no metric emitted', async function () {
// this.retries(3)

const exts = await makeTestExtensions(2)

await exts[0].ext.start()
@@ -95,9 +107,7 @@ export const crashMonitoringTest = async () => {
assertTelemetry('session_end', [])
})

it('single running instances crashes, so nothing is reported, but a new instaces appears and reports', async function () {
// this.retries(3)

it('single running instance crashes, so nothing is reported, but a new instance appears and reports', async function () {
const exts = await makeTestExtensions(2)

await exts[0].ext.start()
@@ -112,34 +122,11 @@ export const crashMonitoringTest = async () => {
assertCrashedExtensions([exts[0]])
})

it('start the first extension, then start many subsequent ones and crash them all at once', async function () {
// this.retries(3)
const latestCrashedExts: TestExtension[] = []

const extCount = 10
const exts = await makeTestExtensions(extCount)
for (let i = 0; i < extCount; i++) {
await exts[i].ext.start()
}

// Crash all exts except the 0th one
for (let i = 1; i < extCount; i++) {
await exts[i].ext.crash()
latestCrashedExts.push(exts[i])
}

// Give some extra time since there is a lot of file i/o
await awaitIntervals(oneInterval * 2)

assertCrashedExtensions(latestCrashedExts)
})

it('the Primary checker crashes and another checker is promoted to Primary', async function () {
// this.retries(3)
it('multiple running instances start+crash at different times, but another instance always reports', async function () {
const latestCrashedExts: TestExtension[] = []

const exts = await makeTestExtensions(4)
// Ext 0 is the Primary checker

await exts[0].ext.start()
await awaitIntervals(oneInterval)

@@ -166,6 +153,73 @@ export const crashMonitoringTest = async () => {
assertCrashedExtensions(latestCrashedExts)
})

it('clears the state when a new os session is determined', async function () {
const exts = await makeTestExtensions(1)

// Start an extension then crash it
await exts[0].ext.start()
await exts[0].ext.crash()
await awaitIntervals(oneInterval)
// There is no other active instance to report the issue
assertTelemetry('session_end', [])

// This extension clears the state due to it being stale, not reporting the previously crashed ext
const ext1 = await makeTestExtension(1, timeLag, { isStateStale: () => Promise.resolve(true) })
await ext1.ext.start()
await awaitIntervals(oneInterval * 1)
assertCrashedExtensions([])
})

it('start the first extension, then start many subsequent ones and crash them all at once', async function () {
const latestCrashedExts: TestExtension[] = []

const extCount = 10
const exts = await makeTestExtensions(extCount)
for (let i = 0; i < extCount; i++) {
await exts[i].ext.start()
}

// Crash all exts except the 0th one
for (let i = 1; i < extCount; i++) {
await exts[i].ext.crash()
latestCrashedExts.push(exts[i])
}

// Give some extra time since there is a lot of file i/o
await awaitIntervals(oneInterval * 2)

assertCrashedExtensions(latestCrashedExts)
})

it('does not check for crashes when there is a time lag', async function () {
// This test handles the case of a user's computer doing a sleep+wake, where
// a crash would be incorrectly reported since a new heartbeat could not be sent in time

const timeLagStub = sandbox.stub(timeLag)
timeLagStub.start.resolves()
timeLagStub.didLag.resolves(false)

// Load up a crash
const ext0 = await makeTestExtension(0, timeLagStub as unknown as TimeLag)
await ext0.ext.start()
await ext0.ext.crash()

const ext1 = await makeTestExtension(1, timeLagStub as unknown as TimeLag)
await ext1.ext.start()

// Indicate that there is a time lag; until didLag() returns false,
// we will skip crash checking
timeLagStub.didLag.resolves(true)
await awaitIntervals(oneInterval)
assertCrashedExtensions([])
await awaitIntervals(oneInterval)
assertCrashedExtensions([])
// Now that the time lag is over, we will check for a crash
timeLagStub.didLag.resolves(false)
await awaitIntervals(oneInterval)
assertCrashedExtensions([ext0])
})

/**
* Something like the following code can switch contexts early and the test will
* finish before it has completed. Certain async functions that may take longer to run
@@ -219,6 +273,31 @@ export const crashMonitoringTest = async () => {
function deduplicate<T>(array: T[], predicate: (a: T, b: T) => boolean): T[] {
return array.filter((item, index, self) => index === self.findIndex((t) => predicate(item, t)))
}

describe('FileSystemState', async function () {
it('ignores irrelevant files in state', async function () {
const state = await crashMonitoringStateFactory({
workDirPath: testFolder.path,
isStateStale: () => Promise.resolve(false),
pid: 1111,
sessionId: 'sessionId_1111',
now: () => globals.clock.Date.now(),
memento: globals.globalState,
isDevMode: true,
devLogger: getLogger(),
})
const stateDirPath = state.stateDirPath

assert.deepStrictEqual((await fs.readdir(stateDirPath)).length, 0)
await fs.writeFile(path.join(stateDirPath, 'ignoreMe.json'), '')
await fs.mkdir(path.join(stateDirPath, 'ignoreMe'))
await state.sendHeartbeat() // creates a relevant file in the state
assert.deepStrictEqual((await fs.readdir(stateDirPath)).length, 3)

const result = await state.getAllExts()
assert.deepStrictEqual(result.length, 1)
})
})
}
// This test is slow, so we only want to run it locally and not in CI. It will be run in the integ CI tests though.
;(isCI() ? describe.skip : describe)('CrashReporting', crashMonitoringTest)
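Given the `.running` suffix introduced in `makeStateFilePath` above and the ids built by `createExtId`, the state directory in the `ignores irrelevant files` test should end up with three entries, only one of which survives the `getAllExts()` filter (names derived from the test's pid `1111` and sessionId `sessionId_1111`):

```
1111_sessionId_1111.running   <- written by sendHeartbeat(); the one entry getAllExts() keeps
ignoreMe.json                 <- filtered out: extension is not ".running"
ignoreMe/                     <- filtered out: a directory, not vscode.FileType.File
```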
