Skip to content

Commit

Permalink
[ws-man] Add started and completed metrics to track health
Browse files Browse the repository at this point in the history
  • Loading branch information
easyCZ authored and roboquat committed Apr 22, 2022
1 parent d16776c commit 5357c9a
Show file tree
Hide file tree
Showing 2 changed files with 84 additions and 2 deletions.
48 changes: 46 additions & 2 deletions components/ws-manager-bridge/src/bridge.ts
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ import { Configuration } from "./config";
import { WorkspaceCluster } from "@gitpod/gitpod-protocol/lib/workspace-cluster";
import { repeat } from "@gitpod/gitpod-protocol/lib/util/repeat";
import { PreparingUpdateEmulator, PreparingUpdateEmulatorFactory } from "./preparing-update-emulator";
import { performance } from "perf_hooks";

// Unique symbol exported under the description "WorkspaceManagerBridgeFactory".
// NOTE(review): presumably used as a dependency-injection service identifier for a
// factory producing WorkspaceManagerBridge instances — confirm against the container bindings.
export const WorkspaceManagerBridgeFactory = Symbol("WorkspaceManagerBridgeFactory");

Expand Down Expand Up @@ -166,13 +167,58 @@ export class WorkspaceManagerBridge implements Disposable {
}

/**
 * Entry point for a single WorkspaceStatus update: validates the status, then delegates to
 * `statusUpdate` while recording "started"/"completed" metrics and logging the outcome.
 *
 * Exactly one "completed" observation is recorded per processed update: outcome "error" when
 * `statusUpdate` (or the "started" report) throws, outcome "success" otherwise.
 *
 * @param ctx trace context forwarded to `statusUpdate`
 * @param rawStatus the status message; converted with `toObject()` for validation and logging
 * @param writeToDB forwarded to `statusUpdate` and attached to the metrics as a label
 * @throws rethrows whatever `statusUpdate` throws, after recording the error metric and logging
 */
protected async handleStatusUpdate(ctx: TraceContext, rawStatus: WorkspaceStatus, writeToDB: boolean) {
    const start = performance.now();
    const status = rawStatus.toObject();
    log.info("Handling WorkspaceStatus update", status);

    // Incomplete updates are dropped before any metric is recorded.
    if (!status.spec || !status.metadata || !status.conditions) {
        log.warn("Received invalid status update", status);
        return;
    }

    const logCtx = {
        instanceId: status.id!,
        workspaceId: status.metadata!.metaId!,
        userId: status.metadata!.owner!,
    };
    const workspaceType = status.spec.type;

    try {
        this.prometheusExporter.reportWorkspaceInstanceUpdateStarted(writeToDB, this.cluster.name, workspaceType);
        await this.statusUpdate(ctx, rawStatus, writeToDB);
    } catch (e) {
        const durationMs = performance.now() - start;
        this.prometheusExporter.reportWorkspaceInstanceUpdateCompleted(
            durationMs / 1000,
            writeToDB,
            this.cluster.name,
            workspaceType,
            e,
        );
        log.error(logCtx, "Failed to complete WorkspaceInstance status update", e);
        throw e;
    }

    // Success is reported here rather than in a `finally` block: `finally` also runs after the
    // `catch` above, which would record every failure a second time as a success.
    const durationMs = performance.now() - start;
    this.prometheusExporter.reportWorkspaceInstanceUpdateCompleted(
        durationMs / 1000,
        writeToDB,
        this.cluster.name,
        workspaceType,
    );
    log.info(logCtx, "Successfully completed WorkspaceInstance status update");
}

private async statusUpdate(ctx: TraceContext, rawStatus: WorkspaceStatus, writeToDB: boolean) {
const status = rawStatus.toObject();

if (!status.spec || !status.metadata || !status.conditions) {
return;
}

const span = TraceContext.startSpan("handleStatusUpdate", ctx);
span.setTag("status", JSON.stringify(filterStatus(status)));
span.setTag("writeToDB", writeToDB);
Expand All @@ -183,7 +229,6 @@ export class WorkspaceManagerBridge implements Disposable {
const instanceId = status.id!;
const workspaceId = status.metadata!.metaId!;
const userId = status.metadata!.owner!;
const logCtx = { instanceId, workspaceId, userId };

const instance = await this.workspaceDB.trace({ span }).findInstanceById(instanceId);
if (instance) {
Expand All @@ -194,7 +239,6 @@ export class WorkspaceManagerBridge implements Disposable {
// We ignore this update because we do not have anything to reconcile this update against, but also because we assume it is handled
// by another instance of ws-manager-bridge that is in the region where the WorkspaceInstance record was created.
this.prometheusExporter.statusUpdateReceived(this.cluster.name, false);
log.warn(logCtx, "Received a status update for an unknown instance", { status });
return;
}

Expand Down
38 changes: 38 additions & 0 deletions components/ws-manager-bridge/src/prometheus-metrics-exporter.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import * as prom from "prom-client";
import { injectable } from "inversify";
import { WorkspaceInstance } from "@gitpod/gitpod-protocol";
import { WorkspaceClusterWoTLS } from "@gitpod/gitpod-protocol/src/workspace-cluster";
import { WorkspaceType } from "@gitpod/ws-manager/lib/core_pb";

@injectable()
export class PrometheusMetricsExporter {
Expand All @@ -18,6 +19,9 @@ export class PrometheusMetricsExporter {
protected readonly statusUpdatesTotal: prom.Counter<string>;
protected readonly stalePrebuildEventsTotal: prom.Counter<string>;

protected readonly workspaceInstanceUpdateStartedTotal: prom.Counter<string>;
protected readonly workspaceInstanceUpdateCompletedSeconds: prom.Histogram<string>;

protected activeClusterNames = new Set<string>();

constructor() {
Expand Down Expand Up @@ -52,6 +56,23 @@ export class PrometheusMetricsExporter {
name: "gitpod_ws_manager_bridge_stale_prebuild_events_total",
help: "Total count of stale prebuild events received by workspace manager bridge",
});

this.workspaceInstanceUpdateStartedTotal = new prom.Counter({
name: "gitpod_ws_manager_bridge_workspace_instance_update_started_total",
help: "Total number of workspace instance updates that started processing",
// we track db_write because we need to be able to distinguish between outcomes which did affect the system negatively - failed to write,
// and outcomes by read-only replicas.
labelNames: ["db_write", "workspace_cluster", "workspace_instance_type"],
});

this.workspaceInstanceUpdateCompletedSeconds = new prom.Histogram({
name: "gitpod_ws_manager_bridge_workspace_instance_update_completed_seconds",
help: "Histogram of completed workspace instance updates, by outcome",
// we track db_write because we need to be able to distinguish between outcomes which did affect the system negatively - failed to write,
// and outcomes by read-only replicas.
labelNames: ["db_write", "workspace_cluster", "workspace_instance_type", "outcome"],
buckets: prom.exponentialBuckets(2, 2, 8),
});
}

observeWorkspaceStartupTime(instance: WorkspaceInstance): void {
Expand Down Expand Up @@ -104,4 +125,21 @@ export class PrometheusMetricsExporter {
/**
 * Records one stale prebuild event by incrementing the `stalePrebuildEventsTotal` counter.
 */
recordStalePrebuildEvent(): void {
    const staleEvents = this.stalePrebuildEventsTotal;
    staleEvents.inc();
}

/**
 * Counts one workspace instance update that has started processing.
 *
 * @param dbWrite whether this update is processed with DB writes enabled; stringified into a label
 * @param workspaceCluster name of the workspace cluster, used as a label
 * @param type workspace type, mapped to its enum member name via `WorkspaceType[type]` for the label
 */
reportWorkspaceInstanceUpdateStarted(dbWrite: boolean, workspaceCluster: string, type: WorkspaceType): void {
    // `labels(...)` only selects the labelled child counter; `inc()` is required to actually count.
    this.workspaceInstanceUpdateStartedTotal.labels(String(dbWrite), workspaceCluster, WorkspaceType[type]).inc();
}

/**
 * Observes the duration of a finished workspace instance update in the
 * `workspaceInstanceUpdateCompletedSeconds` histogram.
 *
 * @param durationSeconds the value observed on the histogram
 * @param dbWrite stringified into a label value
 * @param workspaceCluster label value identifying the workspace cluster
 * @param type workspace type; the label value is `WorkspaceType[type]`
 * @param error when truthy the outcome label is "error", otherwise "success"
 */
reportWorkspaceInstanceUpdateCompleted(
    durationSeconds: number,
    dbWrite: boolean,
    workspaceCluster: string,
    type: WorkspaceType,
    error?: Error,
): void {
    let outcome = "success";
    if (error) {
        outcome = "error";
    }

    // NOTE(review): `WorkspaceType[type]` relies on the enum having a numeric reverse mapping at
    // runtime — confirm this holds for the imported `WorkspaceType`.
    const typeLabel = WorkspaceType[type];
    const observer = this.workspaceInstanceUpdateCompletedSeconds.labels(
        String(dbWrite),
        workspaceCluster,
        typeLabel,
        outcome,
    );
    observer.observe(durationSeconds);
}
}

0 comments on commit 5357c9a

Please sign in to comment.