From 31106295cfe29c208082a630861ff6fca8473e71 Mon Sep 17 00:00:00 2001 From: Milan Pavlik Date: Wed, 20 Apr 2022 11:54:14 +0000 Subject: [PATCH] [ws-man] Add started and completed metrics to track health --- components/ws-manager-bridge/src/bridge.ts | 176 ++++++++++++++---- .../src/prometheus-metrics-exporter.ts | 108 +++++++---- 2 files changed, 212 insertions(+), 72 deletions(-) diff --git a/components/ws-manager-bridge/src/bridge.ts b/components/ws-manager-bridge/src/bridge.ts index 20397a7bd03c7c..73bd5d69309aed 100644 --- a/components/ws-manager-bridge/src/bridge.ts +++ b/components/ws-manager-bridge/src/bridge.ts @@ -6,14 +6,30 @@ import { inject, injectable, interfaces } from "inversify"; import { MessageBusIntegration } from "./messagebus-integration"; -import { Disposable, WorkspaceInstance, Queue, WorkspaceInstancePort, PortVisibility, RunningWorkspaceInfo, DisposableCollection } from "@gitpod/gitpod-protocol"; -import { WorkspaceStatus, WorkspacePhase, GetWorkspacesRequest, WorkspaceConditionBool, PortVisibility as WsManPortVisibility, PromisifiedWorkspaceManagerClient } from "@gitpod/ws-manager/lib"; +import { + Disposable, + DisposableCollection, + PortVisibility, + Queue, + RunningWorkspaceInfo, + WorkspaceInstance, + WorkspaceInstancePort, +} from "@gitpod/gitpod-protocol"; +import { + GetWorkspacesRequest, + PortVisibility as WsManPortVisibility, + PromisifiedWorkspaceManagerClient, + WorkspaceConditionBool, + WorkspacePhase, + WorkspaceStatus, + WorkspaceType, +} from "@gitpod/ws-manager/lib"; import { WorkspaceDB } from "@gitpod/gitpod-db/lib/workspace-db"; import { UserDB } from "@gitpod/gitpod-db/lib/user-db"; -import { log } from '@gitpod/gitpod-protocol/lib/util/logging'; +import { log } from "@gitpod/gitpod-protocol/lib/util/logging"; import { TraceContext } from "@gitpod/gitpod-protocol/lib/util/tracing"; import { IAnalyticsWriter } from "@gitpod/gitpod-protocol/lib/analytics"; -import { TracedWorkspaceDB, TracedUserDB, DBWithTracing } from '@gitpod/gitpod-db/lib/traced-db'; +import { DBWithTracing, TracedUserDB, TracedWorkspaceDB } from "@gitpod/gitpod-db/lib/traced-db"; import { PrometheusMetricsExporter } from "./prometheus-metrics-exporter"; import { ClientProvider, WsmanSubscriber } from "./wsman-subscriber"; import { Timestamp } from "google-protobuf/google/protobuf/timestamp_pb"; @@ -21,6 +37,7 @@ import { Configuration } from "./config"; import { WorkspaceCluster } from "@gitpod/gitpod-protocol/lib/workspace-cluster"; import { repeat } from "@gitpod/gitpod-protocol/lib/util/repeat"; import { PreparingUpdateEmulator, PreparingUpdateEmulatorFactory } from "./preparing-update-emulator"; +import { performance } from "perf_hooks"; export const WorkspaceManagerBridgeFactory = Symbol("WorkspaceManagerBridgeFactory"); @@ -71,7 +88,7 @@ export class WorkspaceManagerBridge implements Disposable { log.debug(`Starting status update handler: ${cluster.name}`, logPayload); /* no await */ this.startStatusUpdateHandler(clientProvider, writeToDB, logPayload) // this is a mere safe-guard: we do not expect the code inside to fail - .catch(err => log.error("Cannot start status update handler", err)); + .catch((err) => log.error("Cannot start status update handler", err)); }; if (cluster.govern) { @@ -102,20 +119,41 @@ export class WorkspaceManagerBridge implements Disposable { this.dispose(); } - protected async startStatusUpdateHandler(clientProvider: ClientProvider, writeToDB: boolean, logPayload: {}): Promise { + protected async startStatusUpdateHandler( + clientProvider: ClientProvider, + writeToDB: boolean, + logPayload: {}, + ): Promise { const subscriber = new WsmanSubscriber(clientProvider); this.disposables.push(subscriber); const onReconnect = (ctx: TraceContext, s: WorkspaceStatus[]) => { - s.forEach(sx => this.serializeMessagesByInstanceId(ctx, sx, m => m.getId(), (ctx, msg) => this.handleStatusUpdate(ctx, msg, writeToDB))) + s.forEach((sx) => + this.serializeMessagesByInstanceId( + ctx, + sx, + (m) => m.getId(), + (ctx, msg) => this.handleStatusUpdate(ctx, msg, writeToDB), + ), + ); }; const onStatusUpdate = (ctx: TraceContext, s: WorkspaceStatus) => { - this.serializeMessagesByInstanceId(ctx, s, msg => msg.getId(), (ctx, s) => this.handleStatusUpdate(ctx, s, writeToDB)) + this.serializeMessagesByInstanceId( + ctx, + s, + (msg) => msg.getId(), + (ctx, s) => this.handleStatusUpdate(ctx, s, writeToDB), + ); }; await subscriber.subscribe({ onReconnect, onStatusUpdate }, logPayload); } - protected serializeMessagesByInstanceId(ctx: TraceContext, msg: M, getInstanceId: (msg: M) => string, handler: (ctx: TraceContext, msg: M) => Promise) { + protected serializeMessagesByInstanceId( + ctx: TraceContext, + msg: M, + getInstanceId: (msg: M) => string, + handler: (ctx: TraceContext, msg: M) => Promise, + ) { const instanceId = getInstanceId(msg); if (!instanceId) { log.warn("Received invalid message, could not read instanceId!", { msg }); @@ -125,18 +163,48 @@ export class WorkspaceManagerBridge implements Disposable { // We can't just handle the status update directly, but have to "serialize" it to ensure the updates stay in order. // If we did not do this, the async nature of our code would allow for one message to overtake the other. let q = this.queues.get(instanceId) || new Queue(); - q.enqueue(() => handler(ctx, msg)).catch(e => log.error({instanceId}, e)); + q.enqueue(() => handler(ctx, msg)).catch((e) => log.error({ instanceId }, e)); this.queues.set(instanceId, q); } protected async handleStatusUpdate(ctx: TraceContext, rawStatus: WorkspaceStatus, writeToDB: boolean) { + const start = performance.now(); const status = rawStatus.toObject(); log.info("Handling WorkspaceStatus update", status); + if (!status.spec || !status.metadata || !status.conditions) { log.warn("Received invalid status update", status); return; } + try { + await this.statusUpdate(ctx, rawStatus, writeToDB); + } catch (e) { + const durationMs = performance.now() - start; + this.prometheusExporter.reportWorkspaceInstanceUpdateCompleted( + durationMs / 1000, + this.cluster.name, + status.spec.type, + e, + ); + } finally { + const durationMs = performance.now() - start; + this.prometheusExporter.reportWorkspaceInstanceUpdateCompleted( + durationMs / 1000, + this.cluster.name, + status.spec.type, + ); + } + } + + private async statusUpdate(ctx: TraceContext, rawStatus: WorkspaceStatus, writeToDB: boolean) { + const status = rawStatus.toObject(); + log.info("Handling WorkspaceStatus update", status); + + if (!status.spec || !status.metadata || !status.conditions) { + return; + } + const span = TraceContext.startSpan("handleStatusUpdate", ctx); span.setTag("status", JSON.stringify(filterStatus(status))); span.setTag("writeToDB", writeToDB); @@ -149,7 +217,7 @@ export class WorkspaceManagerBridge implements Disposable { const userId = status.metadata!.owner!; const logCtx = { instanceId, workspaceId, userId }; - const instance = await this.workspaceDB.trace({span}).findInstanceById(instanceId); + const instance = await this.workspaceDB.trace({ span }).findInstanceById(instanceId); if (instance) { this.prometheusExporter.statusUpdateReceived(this.cluster.name, true); } else { @@ -159,7 +227,7 @@ export class WorkspaceManagerBridge implements Disposable { } if (!!status.spec.exposedPortsList) { - instance.status.exposedPorts = status.spec.exposedPortsList.map(p => { + instance.status.exposedPorts = status.spec.exposedPortsList.map((p) => { return { port: p.port, visibility: mapPortVisibility(p.visibility), @@ -180,7 +248,9 @@ export class WorkspaceManagerBridge implements Disposable { instance.status.conditions.pullingImages = toBool(status.conditions.pullingImages!); instance.status.conditions.deployed = toBool(status.conditions.deployed); instance.status.conditions.timeout = status.conditions.timeout; - instance.status.conditions.firstUserActivity = mapFirstUserActivity(rawStatus.getConditions()!.getFirstUserActivity()); + instance.status.conditions.firstUserActivity = mapFirstUserActivity( + rawStatus.getConditions()!.getFirstUserActivity(), + ); instance.status.conditions.headlessTaskFailed = status.conditions.headlessTaskFailed; instance.status.conditions.stoppedByRequest = toBool(status.conditions.stoppedByRequest); instance.status.message = status.message; @@ -191,7 +261,7 @@ export class WorkspaceManagerBridge implements Disposable { if (status.repo) { const r = status.repo; - const undefinedIfEmpty = (l: T[]) => l.length > 0 ? l : undefined; + const undefinedIfEmpty = (l: T[]) => (l.length > 0 ? l : undefined); instance.status.repo = { branch: r.branch, @@ -201,8 +271,8 @@ export class WorkspaceManagerBridge implements Disposable { unpushedCommits: undefinedIfEmpty(r.unpushedCommitsList), totalUntrackedFiles: r.totalUntrackedFiles, untrackedFiles: undefinedIfEmpty(r.untrackedFilesList), - totalUnpushedCommits: r.totalUnpushedCommits - } + totalUnpushedCommits: r.totalUnpushedCommits, + }; } if (instance.status.conditions.deployed && !instance.deployedTime) { @@ -238,7 +308,7 @@ export class WorkspaceManagerBridge implements Disposable { instance.status.phase = "interrupted"; break; case WorkspacePhase.STOPPING: - if (instance.status.phase != 'stopped') { + if (instance.status.phase != "stopped") { instance.status.phase = "stopping"; if (!instance.stoppingTime) { // The first time a workspace enters stopping we record that as it's stoppingTime time. @@ -259,14 +329,14 @@ export class WorkspaceManagerBridge implements Disposable { // yet. Just for this case we need to set it now. instance.stoppingTime = now; } - lifecycleHandler = () => this.onInstanceStopped({span}, userId, instance); + lifecycleHandler = () => this.onInstanceStopped({ span }, userId, instance); break; } span.setTag("after", JSON.stringify(instance)); // now notify all prebuild listeners about updates - and update DB if needed - await this.updatePrebuiltWorkspace({span}, userId, status, writeToDB); + await this.updatePrebuiltWorkspace({ span }, userId, status, writeToDB); if (writeToDB) { await this.workspaceDB.trace(ctx).storeInstance(instance); @@ -280,16 +350,20 @@ export class WorkspaceManagerBridge implements Disposable { } } await this.messagebus.notifyOnInstanceUpdate(ctx, userId, instance); - } catch (e) { - TraceContext.setError({span}, e); + TraceContext.setError({ span }, e); throw e; } finally { span.finish(); } } - protected startController(clientProvider: ClientProvider, controllerIntervalSeconds: number, controllerMaxDisconnectSeconds: number, maxTimeToRunningPhaseSeconds = 60 * 60) { + protected startController( + clientProvider: ClientProvider, + controllerIntervalSeconds: number, + controllerMaxDisconnectSeconds: number, + maxTimeToRunningPhaseSeconds = 60 * 60, + ) { let disconnectStarted = Number.MAX_SAFE_INTEGER; this.disposables.push( repeat(async () => { @@ -297,39 +371,57 @@ export class WorkspaceManagerBridge implements Disposable { const client = await clientProvider(); await this.controlInstallationInstances(client, maxTimeToRunningPhaseSeconds); - disconnectStarted = Number.MAX_SAFE_INTEGER; // Reset disconnect period + disconnectStarted = Number.MAX_SAFE_INTEGER; // Reset disconnect period } catch (e) { if (durationLongerThanSeconds(disconnectStarted, controllerMaxDisconnectSeconds)) { - log.warn("Error while controlling installation's workspaces", e, { installation: this.cluster.name }); + log.warn("Error while controlling installation's workspaces", e, { + installation: this.cluster.name, + }); } else if (disconnectStarted > Date.now()) { disconnectStarted = Date.now(); } } - }, controllerIntervalSeconds * 1000) + }, controllerIntervalSeconds * 1000), ); } - protected async controlInstallationInstances(client: PromisifiedWorkspaceManagerClient, maxTimeToRunningPhaseSeconds: number) { + protected async controlInstallationInstances( + client: PromisifiedWorkspaceManagerClient, + maxTimeToRunningPhaseSeconds: number, + ) { const installation = this.cluster.name; log.debug("controlling instances", { installation }); let ctx: TraceContext = {}; const runningInstances = await this.workspaceDB.trace(ctx).findRunningInstancesWithWorkspaces(installation); const runningInstancesIdx = new Map(); - runningInstances.forEach(i => runningInstancesIdx.set(i.latestInstance.id, i)); + runningInstances.forEach((i) => runningInstancesIdx.set(i.latestInstance.id, i)); const actuallyRunningInstances = await client.getWorkspaces(ctx, new GetWorkspacesRequest()); - actuallyRunningInstances.getStatusList().forEach(s => runningInstancesIdx.delete(s.getId())); + actuallyRunningInstances.getStatusList().forEach((s) => runningInstancesIdx.delete(s.getId())); const promises: Promise[] = []; for (const [instanceId, ri] of runningInstancesIdx.entries()) { const instance = ri.latestInstance; - if (!(instance.status.phase === 'running' || durationLongerThanSeconds(Date.parse(instance.creationTime), maxTimeToRunningPhaseSeconds))) { - log.debug({ instanceId }, "Skipping instance", { phase: instance.status.phase, creationTime: instance.creationTime, region: instance.region }); + if ( + !( + instance.status.phase === "running" || + durationLongerThanSeconds(Date.parse(instance.creationTime), maxTimeToRunningPhaseSeconds) + ) + ) { + log.debug({ instanceId }, "Skipping instance", { + phase: instance.status.phase, + creationTime: instance.creationTime, + region: instance.region, + }); continue; } - log.info({instanceId, workspaceId: instance.workspaceId}, "Database says the instance is starting for too long or running, but wsman does not know about it. Marking as stopped in database.", {installation}); + log.info( + { instanceId, workspaceId: instance.workspaceId }, + "Database says the instance is starting for too long or running, but wsman does not know about it. Marking as stopped in database.", + { installation }, + ); instance.status.phase = "stopped"; instance.stoppingTime = new Date().toISOString(); instance.stoppedTime = new Date().toISOString(); @@ -344,7 +436,12 @@ export class WorkspaceManagerBridge implements Disposable { // probes are an EE feature - we just need the hook here } - protected async updatePrebuiltWorkspace(ctx: TraceContext, userId: string, status: WorkspaceStatus.AsObject, writeToDB: boolean) { + protected async updatePrebuiltWorkspace( + ctx: TraceContext, + userId: string, + status: WorkspaceStatus.AsObject, + writeToDB: boolean, + ) { // prebuilds are an EE feature - we just need the hook here } @@ -352,19 +449,23 @@ export class WorkspaceManagerBridge implements Disposable { // prebuilds are an EE feature - we just need the hook here } - protected async onInstanceStopped(ctx: TraceContext, ownerUserID: string, instance: WorkspaceInstance): Promise { + protected async onInstanceStopped( + ctx: TraceContext, + ownerUserID: string, + instance: WorkspaceInstance, + ): Promise { const span = TraceContext.startSpan("onInstanceStopped", ctx); try { - await this.userDB.trace({span}).deleteGitpodTokensNamedLike(ownerUserID, `${instance.id}-%`); + await this.userDB.trace({ span }).deleteGitpodTokensNamedLike(ownerUserID, `${instance.id}-%`); this.analytics.track({ userId: ownerUserID, event: "workspace_stopped", messageId: `bridge-wsstopped-${instance.id}`, - properties: { "instanceId": instance.id, "workspaceId": instance.workspaceId } + properties: { instanceId: instance.id, workspaceId: instance.workspaceId }, }); } catch (err) { - TraceContext.setError({span}, err); + TraceContext.setError({ span }, err); throw err; } finally { span.finish(); @@ -374,7 +475,6 @@ export class WorkspaceManagerBridge implements Disposable { public dispose() { this.disposables.dispose(); } - } const mapFirstUserActivity = (firstUserActivity: Timestamp | undefined): string | undefined => { @@ -413,4 +513,4 @@ const filterStatus = (status: WorkspaceStatus.AsObject): Partial; protected readonly stalePrebuildEventsTotal: prom.Counter; + protected readonly workspaceInstanceUpdateStartedTotal: prom.Counter; + protected readonly workspaceInstanceUpdateCompletedSeconds: prom.Histogram; + protected activeClusterNames = new Set(); constructor() { this.workspaceStartupTimeHistogram = new prom.Histogram({ - name: 'workspace_startup_time', - help: 'The time until a workspace instance is marked running', - labelNames: ['neededImageBuild', 'region'], + name: "workspace_startup_time", + help: "The time until a workspace instance is marked running", + labelNames: ["neededImageBuild", "region"], buckets: prom.exponentialBuckets(2, 2, 10), }); this.timeToFirstUserActivityHistogram = new prom.Histogram({ - name: 'first_user_activity_time', - help: 'The time between a workspace is running and first user activity', - labelNames: ['region'], + name: "first_user_activity_time", + help: "The time between a workspace is running and first user activity", + labelNames: ["region"], buckets: prom.exponentialBuckets(2, 2, 10), }); this.clusterScore = new prom.Gauge({ - name: 'gitpod_ws_manager_bridge_cluster_score', - help: 'Score of the individual registered workspace cluster', - labelNames: ["workspace_cluster"] + name: "gitpod_ws_manager_bridge_cluster_score", + help: "Score of the individual registered workspace cluster", + labelNames: ["workspace_cluster"], }); this.clusterCordoned = new prom.Gauge({ - name: 'gitpod_ws_manager_bridge_cluster_cordoned', - help: 'Cordoned status of the individual registered workspace cluster', - labelNames: ["workspace_cluster"] + name: "gitpod_ws_manager_bridge_cluster_cordoned", + help: "Cordoned status of the individual registered workspace cluster", + labelNames: ["workspace_cluster"], }); this.statusUpdatesTotal = new prom.Counter({ - name: 'gitpod_ws_manager_bridge_status_updates_total', - help: 'Total workspace status updates received', - labelNames: ["workspace_cluster", "known_instance"] + name: "gitpod_ws_manager_bridge_status_updates_total", + help: "Total workspace status updates received", + labelNames: ["workspace_cluster", "known_instance"], }); this.stalePrebuildEventsTotal = new prom.Counter({ name: "gitpod_ws_manager_bridge_stale_prebuild_events_total", - help: "Total count of stale prebuild events received by workspace manager bridge" - }) + help: "Total count of stale prebuild events received by workspace manager bridge", + }); + + this.workspaceInstanceUpdateStartedTotal = new prom.Counter({ + name: "gitpod_ws_manager_bridge_workspace_instance_update_started_total", + help: "Total number of workspace instance updates that started processing", + labelNames: ["workspace_cluster", "workspace_instance_type"], + }); + + this.workspaceInstanceUpdateCompletedSeconds = new prom.Histogram({ + name: "gitpod_ws_manager_bridge_workspace_instance_update_completed_seconds", + help: "Histogram of completed workspace instance updates, by outcome", + labelNames: ["workspace_cluster", "workspace_instance_type", "outcome"], + buckets: prom.exponentialBuckets(2, 2, 8), + }); } observeWorkspaceStartupTime(instance: WorkspaceInstance): void { - const timeToRunningSecs = (new Date(instance.startedTime!).getTime() - new Date(instance.creationTime).getTime()) / 1000; - this.workspaceStartupTimeHistogram.observe({ - neededImageBuild: JSON.stringify(instance.status.conditions.neededImageBuild), - region: instance.region, - }, timeToRunningSecs); + const timeToRunningSecs = + (new Date(instance.startedTime!).getTime() - new Date(instance.creationTime).getTime()) / 1000; + this.workspaceStartupTimeHistogram.observe( + { + neededImageBuild: JSON.stringify(instance.status.conditions.neededImageBuild), + region: instance.region, + }, + timeToRunningSecs, + ); } observeFirstUserActivity(instance: WorkspaceInstance, firstUserActivity: string): void { @@ -67,22 +88,26 @@ export class PrometheusMetricsExporter { return; } - const timeToFirstUserActivity = (new Date(firstUserActivity).getTime() - new Date(instance.startedTime!).getTime()) / 1000; - this.timeToFirstUserActivityHistogram.observe({ - region: instance.region, - }, timeToFirstUserActivity); + const timeToFirstUserActivity = + (new Date(firstUserActivity).getTime() - new Date(instance.startedTime!).getTime()) / 1000; + this.timeToFirstUserActivityHistogram.observe( + { + region: instance.region, + }, + timeToFirstUserActivity, + ); } updateClusterMetrics(clusters: WorkspaceClusterWoTLS[]): void { let newActiveClusterNames = new Set(); - clusters.forEach(cluster => { - this.clusterCordoned.labels(cluster.name).set(cluster.state === 'cordoned' ? 1 : 0); + clusters.forEach((cluster) => { + this.clusterCordoned.labels(cluster.name).set(cluster.state === "cordoned" ? 1 : 0); this.clusterScore.labels(cluster.name).set(cluster.score); newActiveClusterNames.add(cluster.name); }); - const noLongerActiveCluster = Array.from(this.activeClusterNames).filter(c => !newActiveClusterNames.has(c)); - noLongerActiveCluster.forEach(clusterName => { + const noLongerActiveCluster = Array.from(this.activeClusterNames).filter((c) => !newActiveClusterNames.has(c)); + noLongerActiveCluster.forEach((clusterName) => { this.clusterCordoned.remove(clusterName); this.clusterScore.remove(clusterName); }); @@ -96,5 +121,20 @@ export class PrometheusMetricsExporter { recordStalePrebuildEvent(): void { this.stalePrebuildEventsTotal.inc(); } -} + reportWorkspaceInstanceUpdateStarted(workspaceCluster: string, type: WorkspaceType): void { + this.workspaceInstanceUpdateStartedTotal.labels(workspaceCluster, WorkspaceType[type]); + } + + reportWorkspaceInstanceUpdateCompleted( + durationSeconds: number, + workspaceCluster: string, + type: WorkspaceType, + error?: Error, + ): void { + const outcome = error ? "error" : "success"; + this.workspaceInstanceUpdateCompletedSeconds + .labels(workspaceCluster, WorkspaceType[type], outcome) + .observe(durationSeconds); + } +}