Skip to content

Commit

Permalink
fix(workspace): force-stop workspaces stuck in a bad state
Browse files Browse the repository at this point in the history
  • Loading branch information
Simon Emms committed Aug 6, 2021
1 parent b09ccef commit c00a164
Show file tree
Hide file tree
Showing 6 changed files with 89 additions and 1 deletion.
5 changes: 5 additions & 0 deletions chart/templates/ws-manager-bridge-configmap.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,11 @@ data:
"host": "localhost",
"port": "8080"
},
"timeouts": {
"metaInstanceCheckIntervalSeconds": 60,
"preparingPhaseSeconds": 7200,
"unknownPhaseSeconds": 600
},
"staticBridges": {{ index (include "ws-manager-list" (dict "root" . "gp" $.Values "comp" .Values.components.server) | fromYaml) "manager" | default list | toJson }}
}
{{- end -}}
Expand Down
2 changes: 1 addition & 1 deletion components/server/src/workspace/workspace-starter.ts
Original file line number Diff line number Diff line change
Expand Up @@ -303,7 +303,7 @@ export class WorkspaceStarter {
workspaceId: workspace.id,
creationTime: now,
ideUrl: '', // Initially empty, filled during starting process
region: '', // Initially empty, filled during starting process
region: this.env.installationShortname, // Shortname set to bridge can cleanup workspaces stuck preparing
workspaceImage: '', // Initially empty, filled during starting process
status: {
conditions: {},
Expand Down
7 changes: 7 additions & 0 deletions components/ws-manager-bridge/src/config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -28,4 +28,11 @@ export interface Configuration {

// maxTimeToRunningPhaseSeconds is the time that we are willing to give a workspace instance in which it has to reach a running state
maxTimeToRunningPhaseSeconds: number;

// timeouts configures the timeout behaviour of pre-workspace cluster workspaces
timeouts: {
metaInstanceCheckIntervalSeconds: number;
preparingPhaseSeconds: number;
unknownPhaseSeconds: number;
}
}
3 changes: 3 additions & 0 deletions components/ws-manager-bridge/src/container-module.ts
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import { WorkspaceManagerClientProviderCompositeSource, WorkspaceManagerClientPr
import { ClusterService, ClusterServiceServer } from './cluster-service-server';
import { IAnalyticsWriter } from '@gitpod/gitpod-protocol/lib/analytics';
import { newAnalyticsWriterFromEnv } from '@gitpod/gitpod-protocol/lib/util/analytics';
import { MetaInstanceController } from './meta-instance-controller';

export const containerModule = new ContainerModule(bind => {

Expand All @@ -31,6 +32,8 @@ export const containerModule = new ContainerModule(bind => {

bind(BridgeController).toSelf().inSingletonScope();

bind(MetaInstanceController).toSelf().inSingletonScope();

bind(WorkspaceManagerClientProvider).toSelf().inSingletonScope();
bind(WorkspaceManagerClientProviderCompositeSource).toSelf().inSingletonScope();
bind(WorkspaceManagerClientProviderSource).to(WorkspaceManagerClientProviderConfigSource).inSingletonScope();
Expand Down
4 changes: 4 additions & 0 deletions components/ws-manager-bridge/src/main.ts
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ import { TypeORM } from '@gitpod/gitpod-db/lib/typeorm/typeorm';
import { TracingManager } from '@gitpod/gitpod-protocol/lib/util/tracing';
import { ClusterServiceServer } from './cluster-service-server';
import { BridgeController } from './bridge-controller';
import { MetaInstanceController } from './meta-instance-controller';

log.enableJSONLogging('ws-manager-bridge', process.env.VERSION);

Expand Down Expand Up @@ -43,6 +44,9 @@ export const start = async (container: Container) => {
const clusterServiceServer = container.get<ClusterServiceServer>(ClusterServiceServer);
await clusterServiceServer.start();

const metaInstanceController = container.get<MetaInstanceController>(MetaInstanceController);
metaInstanceController.start();

process.on('SIGTERM', async () => {
log.info("SIGTERM received, stopping");
bridgeController.dispose();
Expand Down
69 changes: 69 additions & 0 deletions components/ws-manager-bridge/src/meta-instance-controller.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
/**
* Copyright (c) 2021 Gitpod GmbH. All rights reserved.
* Licensed under the GNU Affero General Public License (AGPL).
* See License-AGPL.txt in the project root for license information.
*/

import { inject, injectable } from "inversify";
import { WorkspaceDB } from "@gitpod/gitpod-db/lib/workspace-db";
import { log } from "@gitpod/gitpod-protocol/lib/util/logging";
import { RunningWorkspaceInfo } from "@gitpod/gitpod-protocol/lib";
import { MessageBusIntegration } from "./messagebus-integration";
import { Configuration } from "./config";

/**
 * Periodically looks for workspace instances that have been sitting in the
 * `preparing` or `unknown` phase for longer than the configured timeouts and
 * marks them as `stopped`.
 *
 * The check runs on a fixed interval (`config.timeouts.metaInstanceCheckIntervalSeconds`)
 * once {@link MetaInstanceController.start} has been called.
 */
@injectable()
export class MetaInstanceController {
    @inject(Configuration)
    protected readonly config: Configuration;

    @inject(MessageBusIntegration)
    protected readonly messagebus: MessageBusIntegration;

    @inject(WorkspaceDB)
    protected readonly workspaceDB: WorkspaceDB;

    /**
     * Runs a single check pass.
     *
     * Loads the instances returned by `findRunningInstancesWithWorkspaces` for the
     * configured installation and, for each one whose phase is `preparing` or
     * `unknown` and whose age (measured from `creationTime`) has reached the
     * matching timeout, sets the phase to `stopped`, stores the instance and
     * publishes an instance update on the message bus.
     *
     * Errors raised while handling an individual instance are logged and do not
     * affect the other instances. A failure of the initial DB lookup is NOT caught
     * here and rejects the returned promise.
     */
    protected async checkAndStopWorkspaces() {
        const instances = await this.workspaceDB.findRunningInstancesWithWorkspaces(this.config.installation);

        await Promise.all(instances.map(async (instance: RunningWorkspaceInfo) => {
            const logContext = { instanceId: instance.latestInstance.id };

            try {
                log.debug(logContext, 'MetaInstanceController: Checking for workspaces to stop');

                // Read the clock once so both thresholds are compared against the same instant.
                const now = Date.now();
                const creationTime = new Date(instance.latestInstance.creationTime).getTime();
                const preparingKillTime = creationTime + (this.config.timeouts.preparingPhaseSeconds * 1000);
                const unknownKillTime = creationTime + (this.config.timeouts.unknownPhaseSeconds * 1000);
                const exceededPreparingTime = now >= preparingKillTime;
                const exceededUnknownTime = now >= unknownKillTime;
                const currentState = instance.latestInstance.status.phase;

                if ((currentState === 'preparing' && exceededPreparingTime) || (currentState === 'unknown' && exceededUnknownTime)) {
                    log.info(logContext, 'MetaInstanceController: Setting workspace instance to stopped', {
                        creationTime,
                        preparingKillTime,
                        unknownKillTime,
                        currentState
                    });

                    instance.latestInstance.status.phase = 'stopped';

                    // Persist first, then notify listeners about the updated instance.
                    await this.workspaceDB.storeInstance(instance.latestInstance);
                    await this.messagebus.notifyOnInstanceUpdate({}, instance.workspace.ownerId, instance.latestInstance);
                }
            } catch (err) {
                log.error(logContext, 'MetaInstanceController: Error whilst stopping workspace instance', err);
            }
        }));
    }

    /**
     * Starts the recurring check. Each tick triggers one
     * {@link MetaInstanceController.checkAndStopWorkspaces} pass.
     */
    public start() {
        log.debug('MetaInstanceController: Starting interval to check for workspaces to stop', {
            interval: this.config.timeouts.metaInstanceCheckIntervalSeconds
        });

        setInterval(() => {
            // The DB lookup inside checkAndStopWorkspaces happens outside its per-instance
            // try/catch, so the returned promise can reject. Handle it here rather than
            // leaving a floating promise that surfaces as an unhandled rejection.
            this.checkAndStopWorkspaces().catch(err => {
                log.error('MetaInstanceController: Error whilst checking for workspaces to stop', err);
            });
        }, this.config.timeouts.metaInstanceCheckIntervalSeconds * 1000);
    }
}

0 comments on commit c00a164

Please sign in to comment.