Skip to content

Commit

Permalink
[ws-man] Add started and completed metrics to track health
Browse files Browse the repository at this point in the history
  • Loading branch information
easyCZ committed Apr 21, 2022
1 parent a903877 commit 109d197
Show file tree
Hide file tree
Showing 2 changed files with 119 additions and 36 deletions.
42 changes: 40 additions & 2 deletions components/ws-manager-bridge/src/bridge.ts
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ import { Configuration } from "./config";
import { WorkspaceCluster } from "@gitpod/gitpod-protocol/lib/workspace-cluster";
import { repeat } from "@gitpod/gitpod-protocol/lib/util/repeat";
import { PreparingUpdateEmulator, PreparingUpdateEmulatorFactory } from "./preparing-update-emulator";
import { performance } from "perf_hooks";

/**
 * Unique symbol exported under the name `WorkspaceManagerBridgeFactory`.
 * NOTE(review): presumably the dependency-injection identifier for a factory
 * that creates `WorkspaceManagerBridge` instances; confirm against the container bindings.
 */
export const WorkspaceManagerBridgeFactory = Symbol("WorkspaceManagerBridgeFactory");

Expand Down Expand Up @@ -166,13 +167,52 @@ export class WorkspaceManagerBridge implements Disposable {
}

/**
 * Processes one WorkspaceStatus update and records health metrics around it.
 *
 * A "started" event is reported before the update is handed to `statusUpdate`.
 * Afterwards exactly one "completed" observation is reported: with the caught
 * error when processing throws, without an error when it finishes normally.
 *
 * Updates that lack `spec`, `metadata` or `conditions` are logged and dropped
 * before any metric is reported.
 *
 * @param ctx trace context forwarded to `statusUpdate`
 * @param rawStatus the status message as received; converted with `toObject()` for validation and logging
 * @param writeToDB forwarded to `statusUpdate` and used as a metric label
 * @throws rethrows any error raised while reporting the start or while processing the update
 */
protected async handleStatusUpdate(ctx: TraceContext, rawStatus: WorkspaceStatus, writeToDB: boolean) {
    const start = performance.now();
    const elapsedSeconds = () => (performance.now() - start) / 1000;

    const status = rawStatus.toObject();
    log.info("Handling WorkspaceStatus update", status);

    if (!status.spec || !status.metadata || !status.conditions) {
        log.warn("Received invalid status update", status);
        return;
    }

    const workspaceType = status.spec.type;

    try {
        this.prometheusExporter.reportWorkspaceInstanceUpdateStarted(writeToDB, this.cluster.name, workspaceType);
        await this.statusUpdate(ctx, rawStatus, writeToDB);
    } catch (e) {
        this.prometheusExporter.reportWorkspaceInstanceUpdateCompleted(
            elapsedSeconds(),
            writeToDB,
            this.cluster.name,
            workspaceType,
            e,
        );
        log.error("Failed to complete WorkspaceInstance status update", e);
        throw e;
    }

    // Only reached when the update did not throw. Reporting success from a `finally`
    // block would also run after a failure, recording the same update a second time
    // as a success and logging a misleading success message.
    this.prometheusExporter.reportWorkspaceInstanceUpdateCompleted(
        elapsedSeconds(),
        writeToDB,
        this.cluster.name,
        workspaceType,
    );
    log.info("Successfully completed WorkspaceInstance status update");
}

private async statusUpdate(ctx: TraceContext, rawStatus: WorkspaceStatus, writeToDB: boolean) {
const status = rawStatus.toObject();

if (!status.spec || !status.metadata || !status.conditions) {
return;
}

const span = TraceContext.startSpan("handleStatusUpdate", ctx);
span.setTag("status", JSON.stringify(filterStatus(status)));
span.setTag("writeToDB", writeToDB);
Expand All @@ -183,7 +223,6 @@ export class WorkspaceManagerBridge implements Disposable {
const instanceId = status.id!;
const workspaceId = status.metadata!.metaId!;
const userId = status.metadata!.owner!;
const logCtx = { instanceId, workspaceId, userId };

const instance = await this.workspaceDB.trace({ span }).findInstanceById(instanceId);
if (instance) {
Expand All @@ -194,7 +233,6 @@ export class WorkspaceManagerBridge implements Disposable {
// We ignore this update because we do not have anything to reconcile this update against, but also because we assume it is handled
// by another instance of ws-manager-bridge that is in the region where the WorkspaceInstance record was created.
this.prometheusExporter.statusUpdateReceived(this.cluster.name, false);
log.warn(logCtx, "Received a status update for an unknown instance", { status });
return;
}

Expand Down
113 changes: 79 additions & 34 deletions components/ws-manager-bridge/src/prometheus-metrics-exporter.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,11 @@
* See License-AGPL.txt in the project root for license information.
*/

import * as prom from 'prom-client';
import * as prom from "prom-client";
import { injectable } from "inversify";
import { WorkspaceInstance } from '@gitpod/gitpod-protocol';
import { WorkspaceClusterWoTLS } from '@gitpod/gitpod-protocol/src/workspace-cluster';
import { WorkspaceInstance } from "@gitpod/gitpod-protocol";
import { WorkspaceClusterWoTLS } from "@gitpod/gitpod-protocol/src/workspace-cluster";
import { WorkspaceType } from "@gitpod/ws-manager/lib/core_pb";

@injectable()
export class PrometheusMetricsExporter {
Expand All @@ -18,71 +19,99 @@ export class PrometheusMetricsExporter {
// Counter: total workspace status updates received.
protected readonly statusUpdatesTotal: prom.Counter<string>;
// Counter: stale prebuild events received by workspace manager bridge.
protected readonly stalePrebuildEventsTotal: prom.Counter<string>;

// Counter: workspace instance updates that started processing.
protected readonly workspaceInstanceUpdateStartedTotal: prom.Counter<string>;
// Histogram: durations (seconds) of completed workspace instance updates, labelled by outcome.
protected readonly workspaceInstanceUpdateCompletedSeconds: prom.Histogram<string>;

// Cluster names that updateClusterMetrics compares against the incoming cluster list
// to find clusters that are no longer active and drop their gauge series.
protected activeClusterNames = new Set<string>();

/**
 * Creates every metric this exporter reports. Each metric is constructed once
 * here and stored on the instance for the reporting methods to use.
 */
constructor() {
    this.workspaceStartupTimeHistogram = new prom.Histogram({
        name: "workspace_startup_time",
        help: "The time until a workspace instance is marked running",
        labelNames: ["neededImageBuild", "region"],
        buckets: prom.exponentialBuckets(2, 2, 10),
    });
    this.timeToFirstUserActivityHistogram = new prom.Histogram({
        name: "first_user_activity_time",
        help: "The time between a workspace is running and first user activity",
        labelNames: ["region"],
        buckets: prom.exponentialBuckets(2, 2, 10),
    });
    this.clusterScore = new prom.Gauge({
        name: "gitpod_ws_manager_bridge_cluster_score",
        help: "Score of the individual registered workspace cluster",
        labelNames: ["workspace_cluster"],
    });
    this.clusterCordoned = new prom.Gauge({
        name: "gitpod_ws_manager_bridge_cluster_cordoned",
        help: "Cordoned status of the individual registered workspace cluster",
        labelNames: ["workspace_cluster"],
    });
    this.statusUpdatesTotal = new prom.Counter({
        name: "gitpod_ws_manager_bridge_status_updates_total",
        help: "Total workspace status updates received",
        labelNames: ["workspace_cluster", "known_instance"],
    });
    this.stalePrebuildEventsTotal = new prom.Counter({
        name: "gitpod_ws_manager_bridge_stale_prebuild_events_total",
        help: "Total count of stale prebuild events received by workspace manager bridge",
    });

    this.workspaceInstanceUpdateStartedTotal = new prom.Counter({
        name: "gitpod_ws_manager_bridge_workspace_instance_update_started_total",
        help: "Total number of workspace instance updates that started processing",
        // we track db_write because we need to be able to distinguish between outcomes which did affect the system negatively - failed to write,
        // and outcomes by read-only replicas.
        labelNames: ["db_write", "workspace_cluster", "workspace_instance_type"],
    });

    this.workspaceInstanceUpdateCompletedSeconds = new prom.Histogram({
        name: "gitpod_ws_manager_bridge_workspace_instance_update_completed_seconds",
        help: "Histogram of completed workspace instance updates, by outcome",
        // we track db_write because we need to be able to distinguish between outcomes which did affect the system negatively - failed to write,
        // and outcomes by read-only replicas.
        labelNames: ["db_write", "workspace_cluster", "workspace_instance_type", "outcome"],
        // NOTE(review): the smallest bucket is 2 seconds. If status updates usually finish
        // faster than that, every observation lands in the first bucket - confirm the intended resolution.
        buckets: prom.exponentialBuckets(2, 2, 8),
    });
}

/**
 * Observes the time between an instance's creation and its start, in seconds,
 * labelled with whether an image build was needed and with the instance's region.
 *
 * @param instance the workspace instance whose `creationTime` and `startedTime` are compared
 */
observeWorkspaceStartupTime(instance: WorkspaceInstance): void {
    // NOTE(review): `startedTime` is asserted non-null here; callers are presumably
    // expected to invoke this only once the instance has started - confirm.
    const timeToRunningSecs =
        (new Date(instance.startedTime!).getTime() - new Date(instance.creationTime).getTime()) / 1000;
    this.workspaceStartupTimeHistogram.observe(
        {
            neededImageBuild: JSON.stringify(instance.status.conditions.neededImageBuild),
            region: instance.region,
        },
        timeToRunningSecs,
    );
}

/**
 * Observes the time between an instance's start and the first user activity,
 * in seconds, labelled with the instance's region.
 * Does nothing when the instance has no `startedTime`.
 *
 * @param instance the workspace instance whose `startedTime` is the reference point
 * @param firstUserActivity timestamp of the first user activity, parsed with `new Date(...)`
 */
observeFirstUserActivity(instance: WorkspaceInstance, firstUserActivity: string): void {
    if (!instance.startedTime) {
        return;
    }

    // The guard above already narrows `startedTime`, so no non-null assertion is needed.
    const timeToFirstUserActivity =
        (new Date(firstUserActivity).getTime() - new Date(instance.startedTime).getTime()) / 1000;
    this.timeToFirstUserActivityHistogram.observe(
        {
            region: instance.region,
        },
        timeToFirstUserActivity,
    );
}

updateClusterMetrics(clusters: WorkspaceClusterWoTLS[]): void {
let newActiveClusterNames = new Set<string>();
clusters.forEach(cluster => {
this.clusterCordoned.labels(cluster.name).set(cluster.state === 'cordoned' ? 1 : 0);
clusters.forEach((cluster) => {
this.clusterCordoned.labels(cluster.name).set(cluster.state === "cordoned" ? 1 : 0);
this.clusterScore.labels(cluster.name).set(cluster.score);
newActiveClusterNames.add(cluster.name);
});

const noLongerActiveCluster = Array.from(this.activeClusterNames).filter(c => !newActiveClusterNames.has(c));
noLongerActiveCluster.forEach(clusterName => {
const noLongerActiveCluster = Array.from(this.activeClusterNames).filter((c) => !newActiveClusterNames.has(c));
noLongerActiveCluster.forEach((clusterName) => {
this.clusterCordoned.remove(clusterName);
this.clusterScore.remove(clusterName);
});
Expand All @@ -96,5 +125,21 @@ export class PrometheusMetricsExporter {
/** Counts one stale prebuild event. */
recordStalePrebuildEvent(): void {
    const staleEvents = this.stalePrebuildEventsTotal;
    staleEvents.inc();
}
}

/**
 * Counts one workspace instance update that has started processing.
 *
 * @param dbWrite stringified into the `db_write` label
 * @param workspaceCluster value of the `workspace_cluster` label
 * @param type workspace type; its enum name becomes the `workspace_instance_type` label
 */
reportWorkspaceInstanceUpdateStarted(dbWrite: boolean, workspaceCluster: string, type: WorkspaceType): void {
    // NOTE(review): `WorkspaceType[type]` relies on a number-to-name reverse mapping
    // existing at runtime; confirm the generated core_pb enum provides one.
    this.workspaceInstanceUpdateStartedTotal
        .labels(String(dbWrite), workspaceCluster, WorkspaceType[type])
        // `labels(...)` only selects the labelled child; the counter changes only when `inc()` is called.
        .inc();
}

/**
 * Observes the duration of one finished workspace instance update.
 * The `outcome` label is "error" when an error is passed and "success" otherwise.
 *
 * @param durationSeconds how long the update took, in seconds
 * @param dbWrite stringified into the `db_write` label
 * @param workspaceCluster value of the `workspace_cluster` label
 * @param type workspace type; its enum name becomes the `workspace_instance_type` label
 * @param error the failure that ended the update, if any
 */
reportWorkspaceInstanceUpdateCompleted(
    durationSeconds: number,
    dbWrite: boolean,
    workspaceCluster: string,
    type: WorkspaceType,
    error?: Error,
): void {
    let outcome = "success";
    if (error) {
        outcome = "error";
    }

    const labelled = this.workspaceInstanceUpdateCompletedSeconds.labels(
        String(dbWrite),
        workspaceCluster,
        WorkspaceType[type],
        outcome,
    );
    labelled.observe(durationSeconds);
}
}

0 comments on commit 109d197

Please sign in to comment.