From c00a164e49a3cc48eb38aefc9bac315fe9e5c9ce Mon Sep 17 00:00:00 2001 From: Simon Emms Date: Wed, 4 Aug 2021 13:50:29 +0000 Subject: [PATCH] fix(workspace): force-stop workspaces stuck in a bad state --- .../ws-manager-bridge-configmap.yaml | 5 ++ .../server/src/workspace/workspace-starter.ts | 2 +- components/ws-manager-bridge/src/config.ts | 7 ++ .../ws-manager-bridge/src/container-module.ts | 3 + components/ws-manager-bridge/src/main.ts | 4 ++ .../src/meta-instance-controller.ts | 69 +++++++++++++++++++ 6 files changed, 89 insertions(+), 1 deletion(-) create mode 100644 components/ws-manager-bridge/src/meta-instance-controller.ts diff --git a/chart/templates/ws-manager-bridge-configmap.yaml b/chart/templates/ws-manager-bridge-configmap.yaml index 3de3ed00e014dc..ab537429dfce7b 100644 --- a/chart/templates/ws-manager-bridge-configmap.yaml +++ b/chart/templates/ws-manager-bridge-configmap.yaml @@ -25,6 +25,11 @@ data: "host": "localhost", "port": "8080" }, + "timeouts": { + "metaInstanceCheckIntervalSeconds": 60, + "preparingPhaseSeconds": 7200, + "unknownPhaseSeconds": 600 + }, "staticBridges": {{ index (include "ws-manager-list" (dict "root" . 
"gp" $.Values "comp" .Values.components.server) | fromYaml) "manager" | default list | toJson }} } {{- end -}} diff --git a/components/server/src/workspace/workspace-starter.ts b/components/server/src/workspace/workspace-starter.ts index 4ca565f75ae5dc..62b4efc8f5e60a 100644 --- a/components/server/src/workspace/workspace-starter.ts +++ b/components/server/src/workspace/workspace-starter.ts @@ -303,7 +303,7 @@ export class WorkspaceStarter { workspaceId: workspace.id, creationTime: now, ideUrl: '', // Initially empty, filled during starting process - region: '', // Initially empty, filled during starting process + region: this.env.installationShortname, // Shortname set to bridge can cleanup workspaces stuck preparing workspaceImage: '', // Initially empty, filled during starting process status: { conditions: {}, diff --git a/components/ws-manager-bridge/src/config.ts b/components/ws-manager-bridge/src/config.ts index 137c90f35bc585..07bdfe113fdba5 100644 --- a/components/ws-manager-bridge/src/config.ts +++ b/components/ws-manager-bridge/src/config.ts @@ -28,4 +28,11 @@ export interface Configuration { // maxTimeToRunningPhaseSeconds is the time that we are willing to give a workspce instance in which it has to reach a running state maxTimeToRunningPhaseSeconds: number; + + // timeouts configures the timeout behaviour of pre-workspace cluster workspaces + timeouts: { + metaInstanceCheckIntervalSeconds: number; + preparingPhaseSeconds: number; + unknownPhaseSeconds: number; + } } diff --git a/components/ws-manager-bridge/src/container-module.ts b/components/ws-manager-bridge/src/container-module.ts index c3db87a694230c..11915b7974f08e 100644 --- a/components/ws-manager-bridge/src/container-module.ts +++ b/components/ws-manager-bridge/src/container-module.ts @@ -22,6 +22,7 @@ import { WorkspaceManagerClientProviderCompositeSource, WorkspaceManagerClientPr import { ClusterService, ClusterServiceServer } from './cluster-service-server'; import { IAnalyticsWriter } from 
'@gitpod/gitpod-protocol/lib/analytics'; import { newAnalyticsWriterFromEnv } from '@gitpod/gitpod-protocol/lib/util/analytics'; +import { MetaInstanceController } from './meta-instance-controller'; export const containerModule = new ContainerModule(bind => { @@ -31,6 +32,8 @@ export const containerModule = new ContainerModule(bind => { bind(BridgeController).toSelf().inSingletonScope(); + bind(MetaInstanceController).toSelf().inSingletonScope(); + bind(WorkspaceManagerClientProvider).toSelf().inSingletonScope(); bind(WorkspaceManagerClientProviderCompositeSource).toSelf().inSingletonScope(); bind(WorkspaceManagerClientProviderSource).to(WorkspaceManagerClientProviderConfigSource).inSingletonScope(); diff --git a/components/ws-manager-bridge/src/main.ts b/components/ws-manager-bridge/src/main.ts index 6598b4a6e6a303..4982a0d4b4a264 100644 --- a/components/ws-manager-bridge/src/main.ts +++ b/components/ws-manager-bridge/src/main.ts @@ -13,6 +13,7 @@ import { TypeORM } from '@gitpod/gitpod-db/lib/typeorm/typeorm'; import { TracingManager } from '@gitpod/gitpod-protocol/lib/util/tracing'; import { ClusterServiceServer } from './cluster-service-server'; import { BridgeController } from './bridge-controller'; +import { MetaInstanceController } from './meta-instance-controller'; log.enableJSONLogging('ws-manager-bridge', process.env.VERSION); @@ -43,6 +44,9 @@ export const start = async (container: Container) => { const clusterServiceServer = container.get(ClusterServiceServer); await clusterServiceServer.start(); + const metaInstanceController = container.get(MetaInstanceController); + metaInstanceController.start(); + process.on('SIGTERM', async () => { log.info("SIGTERM received, stopping"); bridgeController.dispose(); diff --git a/components/ws-manager-bridge/src/meta-instance-controller.ts b/components/ws-manager-bridge/src/meta-instance-controller.ts new file mode 100644 index 00000000000000..bcd7c2ab222d9a --- /dev/null +++ 
b/components/ws-manager-bridge/src/meta-instance-controller.ts
@@ -0,0 +1,91 @@
+/**
+ * Copyright (c) 2021 Gitpod GmbH. All rights reserved.
+ * Licensed under the GNU Affero General Public License (AGPL).
+ * See License-AGPL.txt in the project root for license information.
+ */
+
+import { inject, injectable } from "inversify";
+import { WorkspaceDB } from "@gitpod/gitpod-db/lib/workspace-db";
+import { log } from "@gitpod/gitpod-protocol/lib/util/logging";
+import { RunningWorkspaceInfo } from "@gitpod/gitpod-protocol/lib";
+import { MessageBusIntegration } from "./messagebus-integration";
+import { Configuration } from "./config";
+
+/**
+ * Periodically scans the instances returned for `config.installation` and marks
+ * those that stayed in the 'preparing' or 'unknown' phase for longer than the
+ * configured timeout as 'stopped'.
+ */
+@injectable()
+export class MetaInstanceController {
+    @inject(Configuration)
+    protected readonly config: Configuration;
+
+    @inject(MessageBusIntegration)
+    protected readonly messagebus: MessageBusIntegration;
+
+    @inject(WorkspaceDB)
+    protected readonly workspaceDB: WorkspaceDB;
+
+    /**
+     * Runs one pass over the instances returned by `findRunningInstancesWithWorkspaces`.
+     * An instance whose phase is 'preparing' or 'unknown' and whose age (measured from
+     * `creationTime`) has reached the matching timeout is set to 'stopped', stored, and
+     * passed to `messagebus.notifyOnInstanceUpdate`. A failure for a single instance is
+     * logged and does not abort the pass; a failure of the initial lookup rejects the
+     * returned promise.
+     */
+    protected async checkAndStopWorkspaces() {
+        const instances = await this.workspaceDB.findRunningInstancesWithWorkspaces(this.config.installation);
+
+        await Promise.all(instances.map(async (instance: RunningWorkspaceInfo) => {
+            const logContext = { instanceId: instance.latestInstance.id };
+
+            try {
+                log.debug(logContext, 'MetaInstanceController: Checking for workspaces to stop');
+
+                // Take a single timestamp so both deadlines are compared against the same instant.
+                const now = Date.now();
+                const creationTime = new Date(instance.latestInstance.creationTime).getTime();
+                const preparingKillTime = creationTime + (this.config.timeouts.preparingPhaseSeconds * 1000);
+                const unknownKillTime = creationTime + (this.config.timeouts.unknownPhaseSeconds * 1000);
+                const exceededPreparingTime = now >= preparingKillTime;
+                const exceededUnknownTime = now >= unknownKillTime;
+                const currentState = instance.latestInstance.status.phase;
+
+                if ((currentState === 'preparing' && exceededPreparingTime) || (currentState === 'unknown' && exceededUnknownTime)) {
+                    log.info(logContext, 'MetaInstanceController: Setting workspace instance to stopped', {
+                        creationTime,
+                        preparingKillTime,
+                        unknownKillTime,
+                        currentState
+                    });
+
+                    // NOTE(review): only the phase is changed here; confirm whether a stopped timestamp should be recorded too.
+                    instance.latestInstance.status.phase = 'stopped';
+
+                    await this.workspaceDB.storeInstance(instance.latestInstance);
+                    await this.messagebus.notifyOnInstanceUpdate({}, instance.workspace.ownerId, instance.latestInstance);
+                }
+            } catch (err) {
+                log.error(logContext, 'MetaInstanceController: Error whilst stopping workspace instance', err);
+            }
+        }));
+    }
+
+    /**
+     * Starts the periodic check, firing every `timeouts.metaInstanceCheckIntervalSeconds` seconds.
+     */
+    public start() {
+        log.debug('MetaInstanceController: Starting interval to check for workspaces to stop', {
+            interval: this.config.timeouts.metaInstanceCheckIntervalSeconds
+        });
+
+        setInterval(() => {
+            // The lookup at the top of checkAndStopWorkspaces runs outside its try/catch, so the
+            // promise can reject; handle it here rather than leaving an unhandled rejection.
+            this.checkAndStopWorkspaces().catch(err => {
+                log.error('MetaInstanceController: Error whilst checking for workspaces to stop', err);
+            });
+        }, this.config.timeouts.metaInstanceCheckIntervalSeconds * 1000);
+    }
+}