From 9074f678d25ee646754e876f5ddf19e54decfb20 Mon Sep 17 00:00:00 2001
From: Milan Pavlik
Date: Wed, 20 Apr 2022 11:54:14 +0000
Subject: [PATCH] [ws-man] Add started and completed metrics to track health

---
 components/ws-manager-bridge/src/bridge.ts    | 59 ++++++++++++++++++-
 .../src/prometheus-metrics-exporter.ts        | 51 +++++++++++++++
 2 files changed, 108 insertions(+), 2 deletions(-)

diff --git a/components/ws-manager-bridge/src/bridge.ts b/components/ws-manager-bridge/src/bridge.ts
index 5c5f4bf9d4ace8..a289b761da2008 100644
--- a/components/ws-manager-bridge/src/bridge.ts
+++ b/components/ws-manager-bridge/src/bridge.ts
@@ -36,6 +36,7 @@ import { Configuration } from "./config";
 import { WorkspaceCluster } from "@gitpod/gitpod-protocol/lib/workspace-cluster";
 import { repeat } from "@gitpod/gitpod-protocol/lib/util/repeat";
 import { PreparingUpdateEmulator, PreparingUpdateEmulatorFactory } from "./preparing-update-emulator";
+import { performance } from "perf_hooks";
 
 export const WorkspaceManagerBridgeFactory = Symbol("WorkspaceManagerBridgeFactory");
 
@@ -166,13 +167,69 @@ export class WorkspaceManagerBridge implements Disposable {
     }
 
+    /**
+     * Handles a single WorkspaceStatus update and records its health metrics: the "started" counter
+     * before processing and the "completed" histogram exactly once afterwards, labelled with the outcome.
+     * Errors thrown by the update are reported, logged and rethrown to the caller.
+     */
     protected async handleStatusUpdate(ctx: TraceContext, rawStatus: WorkspaceStatus, writeToDB: boolean) {
+        const start = performance.now();
         const status = rawStatus.toObject();
         log.info("Handling WorkspaceStatus update", status);
+
         if (!status.spec || !status.metadata || !status.conditions) {
             log.warn("Received invalid status update", status);
             return;
         }
 
+        const logCtx = {
+            instanceId: status.id!,
+            workspaceId: status.metadata!.metaId!,
+            userId: status.metadata!.owner!,
+        };
+
+        try {
+            this.prometheusExporter.reportWorkspaceInstanceUpdateStarted(
+                writeToDB,
+                this.cluster.name,
+                status.spec.type,
+            );
+            await this.statusUpdate(ctx, rawStatus, writeToDB);
+        } catch (e) {
+            const durationMs = performance.now() - start;
+            this.prometheusExporter.reportWorkspaceInstanceUpdateCompleted(
+                durationMs / 1000,
+                writeToDB,
+                this.cluster.name,
+                status.spec.type,
+                e,
+            );
+            log.error(logCtx, "Failed to complete WorkspaceInstance status update", e);
+            throw e;
+        }
+
+        // Report success only after the try/catch: a `finally` block would also run on the error path
+        // and record a second, bogus "success" observation (and log line) for an update that failed.
+        const durationMs = performance.now() - start;
+        this.prometheusExporter.reportWorkspaceInstanceUpdateCompleted(
+            durationMs / 1000,
+            writeToDB,
+            this.cluster.name,
+            status.spec.type,
+        );
+        log.info(logCtx, "Successfully completed WorkspaceInstance status update");
+    }
+
+    /**
+     * Applies the status update; split out of handleStatusUpdate so that the caller can time and report it.
+     */
+    private async statusUpdate(ctx: TraceContext, rawStatus: WorkspaceStatus, writeToDB: boolean) {
+        const status = rawStatus.toObject();
+
+        // Same guard as in handleStatusUpdate: rawStatus is converted again here, so the fields are re-checked.
+        if (!status.spec || !status.metadata || !status.conditions) {
+            return;
+        }
+
         const span = TraceContext.startSpan("handleStatusUpdate", ctx);
         span.setTag("status", JSON.stringify(filterStatus(status)));
         span.setTag("writeToDB", writeToDB);
@@ -183,7 +240,6 @@ export class WorkspaceManagerBridge implements Disposable {
             const instanceId = status.id!;
             const workspaceId = status.metadata!.metaId!;
             const userId = status.metadata!.owner!;
-            const logCtx = { instanceId, workspaceId, userId };
 
             const instance = await this.workspaceDB.trace({ span }).findInstanceById(instanceId);
             if (instance) {
@@ -194,7 +250,6 @@ export class WorkspaceManagerBridge implements Disposable {
                 // We ignore this update because we do not have anything to reconcile this update against, but also because we assume it is handled
                 // by another instance of ws-manager-bridge that is in the region where the WorkspaceInstance record was created.
                 this.prometheusExporter.statusUpdateReceived(this.cluster.name, false);
-                log.warn(logCtx, "Received a status update for an unknown instance", { status });
                 return;
             }
 
diff --git a/components/ws-manager-bridge/src/prometheus-metrics-exporter.ts b/components/ws-manager-bridge/src/prometheus-metrics-exporter.ts
index 8de59ed2a2ce1e..c17b4a3911d56b 100644
--- a/components/ws-manager-bridge/src/prometheus-metrics-exporter.ts
+++ b/components/ws-manager-bridge/src/prometheus-metrics-exporter.ts
@@ -8,6 +8,7 @@ import * as prom from "prom-client";
 import { injectable } from "inversify";
 import { WorkspaceInstance } from "@gitpod/gitpod-protocol";
 import { WorkspaceClusterWoTLS } from "@gitpod/gitpod-protocol/src/workspace-cluster";
+import { WorkspaceType } from "@gitpod/ws-manager/lib/core_pb";
 
 @injectable()
 export class PrometheusMetricsExporter {
@@ -18,6 +19,9 @@ export class PrometheusMetricsExporter {
     protected readonly statusUpdatesTotal: prom.Counter;
     protected readonly stalePrebuildEventsTotal: prom.Counter;
 
+    protected readonly workspaceInstanceUpdateStartedTotal: prom.Counter;
+    protected readonly workspaceInstanceUpdateCompletedSeconds: prom.Histogram;
+
     protected activeClusterNames = new Set();
 
     constructor() {
@@ -52,6 +56,24 @@ export class PrometheusMetricsExporter {
             name: "gitpod_ws_manager_bridge_stale_prebuild_events_total",
             help: "Total count of stale prebuild events received by workspace manager bridge",
         });
+
+        this.workspaceInstanceUpdateStartedTotal = new prom.Counter({
+            name: "gitpod_ws_manager_bridge_workspace_instance_update_started_total",
+            help: "Total number of workspace instance updates that started processing",
+            // we track db_write because we need to be able to distinguish between outcomes which did affect the system negatively - failed to write,
+            // and outcomes by read-only replicas.
+            labelNames: ["db_write", "workspace_cluster", "workspace_instance_type"],
+        });
+
+        this.workspaceInstanceUpdateCompletedSeconds = new prom.Histogram({
+            name: "gitpod_ws_manager_bridge_workspace_instance_update_completed_seconds",
+            help: "Histogram of completed workspace instance updates, by outcome",
+            // we track db_write because we need to be able to distinguish between outcomes which did affect the system negatively - failed to write,
+            // and outcomes by read-only replicas.
+            labelNames: ["db_write", "workspace_cluster", "workspace_instance_type", "outcome"],
+            // bucket upper bounds in seconds: 2, 4, 8, 16, 32, 64, 128, 256
+            buckets: prom.exponentialBuckets(2, 2, 8),
+        });
     }
 
     observeWorkspaceStartupTime(instance: WorkspaceInstance): void {
@@ -104,4 +126,33 @@ export class PrometheusMetricsExporter {
     recordStalePrebuildEvent(): void {
         this.stalePrebuildEventsTotal.inc();
     }
+
+    /**
+     * Counts one workspace instance update that has started processing.
+     */
+    reportWorkspaceInstanceUpdateStarted(dbWrite: boolean, workspaceCluster: string, type: WorkspaceType): void {
+        // NOTE(review): WorkspaceType[type] relies on a numeric-to-name reverse mapping; confirm the
+        // generated core_pb enum provides one at runtime, otherwise this label is undefined.
+        // labels() only selects the labelled child counter; inc() is what actually records the update.
+        this.workspaceInstanceUpdateStartedTotal.labels(String(dbWrite), workspaceCluster, WorkspaceType[type]).inc();
+    }
+
+    /**
+     * Observes the duration of one finished workspace instance update.
+     *
+     * @param durationSeconds time spent processing the update, in seconds
+     * @param error when set, the observation is labelled with outcome "error", otherwise "success"
+     */
+    reportWorkspaceInstanceUpdateCompleted(
+        durationSeconds: number,
+        dbWrite: boolean,
+        workspaceCluster: string,
+        type: WorkspaceType,
+        error?: Error,
+    ): void {
+        const outcome = error ? "error" : "success";
+        this.workspaceInstanceUpdateCompletedSeconds
+            .labels(String(dbWrite), workspaceCluster, WorkspaceType[type], outcome)
+            .observe(durationSeconds);
+    }
 }