diff --git a/components/ws-manager/pkg/manager/metrics.go b/components/ws-manager/pkg/manager/metrics.go index 7cdd9fa6f64eee..f83d9abb6bf8da 100644 --- a/components/ws-manager/pkg/manager/metrics.go +++ b/components/ws-manager/pkg/manager/metrics.go @@ -52,6 +52,8 @@ type metrics struct { totalRestoreCounterVec *prometheus.CounterVec totalRestoreFailureCounterVec *prometheus.CounterVec totalUnintentionalWorkspaceStopCounterVec *prometheus.CounterVec + totalMountDeviceFailedVec *prometheus.CounterVec + totalCannotMountVolumeVec *prometheus.CounterVec // Gauge totalOpenPortGauge prometheus.GaugeFunc @@ -142,6 +144,18 @@ func newMetrics(m *Manager) *metrics { Name: "workspace_unintentional_stop_total", Help: "total number of workspaces when container stopped without being deleted prior", }, []string{"type", "class"}), + totalMountDeviceFailedVec: prometheus.NewCounterVec(prometheus.CounterOpts{ + Namespace: metricsNamespace, + Subsystem: metricsWorkspaceSubsystem, + Name: "workspace_mount_device_failed", + Help: "total number of workspace mount device failed", + }, []string{"type", "class"}), + totalCannotMountVolumeVec: prometheus.NewCounterVec(prometheus.CounterOpts{ + Namespace: metricsNamespace, + Subsystem: metricsWorkspaceSubsystem, + Name: "workspace_cannot_mount_volume", + Help: "total number of workspace cannot mount volume", + }, []string{"type", "class"}), totalOpenPortGauge: prometheus.NewGaugeFunc(prometheus.GaugeOpts{ Namespace: metricsNamespace, Subsystem: metricsWorkspaceSubsystem, @@ -205,6 +219,8 @@ func (m *metrics) Register(reg prometheus.Registerer) error { m.totalRestoreCounterVec, m.totalRestoreFailureCounterVec, m.totalUnintentionalWorkspaceStopCounterVec, + m.totalMountDeviceFailedVec, + m.totalCannotMountVolumeVec, m.totalOpenPortGauge, } for _, c := range collectors { diff --git a/components/ws-manager/pkg/manager/status.go b/components/ws-manager/pkg/manager/status.go index 9d5d556aa396d0..da378bfd564b92 100644 --- a/components/ws-manager/pkg/manager/status.go +++ b/components/ws-manager/pkg/manager/status.go @@ -528,6 +528,8 @@ func (m *Manager) extractStatusFromPod(result *api.WorkspaceStatus, wso workspac // one should extract the phase themselves. If the pod has not failed, this function returns "", nil. func extractFailure(wso workspaceObjects, metrics *metrics) (string, *api.WorkspacePhase) { pod := wso.Pod + wsType := strings.ToUpper(pod.Labels[wsk8s.TypeLabel]) + wsClass := pod.Labels[workspaceClassLabel] // if the workspace was explicitely marked as failed that also constitutes a failure reason reason, explicitFailure := pod.Annotations[workspaceExplicitFailAnnotation] @@ -590,8 +592,6 @@ func extractFailure(wso workspaceObjects, metrics *metrics) (string, *api.Worksp } else if terminationState.Reason == "Completed" { // container terminated successfully - this is not a failure if !isPodBeingDeleted(pod) { - wsType := strings.ToUpper(pod.Labels[wsk8s.TypeLabel]) - wsClass := pod.Labels[workspaceClassLabel] if metrics != nil && !wso.IsWorkspaceHeadless() { metrics.totalUnintentionalWorkspaceStopCounterVec.WithLabelValues(wsType, wsClass).Inc() } @@ -618,9 +618,17 @@ func extractFailure(wso workspaceObjects, metrics *metrics) (string, *api.Worksp if strings.Contains(evt.Message, "MountVolume.MountDevice failed for volume") { // ref: https://github.com/gitpod-io/gitpod/issues/13353 // ref: https://github.com/kubernetes-sigs/gcp-compute-persistent-disk-csi-driver/issues/608 + log.WithField("pod", pod.Name).Warnf("%s", evt.Message) + if metrics != nil { + metrics.totalMountDeviceFailedVec.WithLabelValues(wsType, wsClass).Inc() + } return "", nil } else if strings.Contains(evt.Message, workspaceVolumeName) { // ref: https://github.com/gitpod-io/gitpod/issues/14032 + log.WithField("pod", pod.Name).Warnf("%s", evt.Message) + if metrics != nil { + metrics.totalCannotMountVolumeVec.WithLabelValues(wsType, wsClass).Inc() + } return "", nil } else { // if this happens we did not do a good job because that means we've introduced another volume to the pod diff --git a/components/ws-manager/pkg/manager/testdata/actOnPodEvent_failedWorkspaceMount_PENDING00.golden b/components/ws-manager/pkg/manager/testdata/actOnPodEvent_failedWorkspaceMount_PENDING00.golden index 0b8b1bb6692f23..5981ded50b76f0 100644 --- a/components/ws-manager/pkg/manager/testdata/actOnPodEvent_failedWorkspaceMount_PENDING00.golden +++ b/components/ws-manager/pkg/manager/testdata/actOnPodEvent_failedWorkspaceMount_PENDING00.golden @@ -1,24 +1,3 @@ { - "actions": [ - { - "Func": "markWorkspace", - "Params": { - "annotations": [ - { - "Name": "gitpod/failedBeforeStopping", - "Value": "true", - "Delete": false - } - ], - "workspaceID": "b3242d9b-6920-41b5-8e72-c3d5637ca148" - } - }, - { - "Func": "stopWorkspace", - "Params": { - "gracePeriod": 30000000000, - "workspaceID": "b3242d9b-6920-41b5-8e72-c3d5637ca148" - } - } - ] -} \ No newline at end of file + "actions": null +} diff --git a/components/ws-manager/pkg/manager/testdata/status_failedWorkspaceMount_PENDING00.golden b/components/ws-manager/pkg/manager/testdata/status_failedWorkspaceMount_PENDING00.golden index 5e4a6756d4826b..e7ff70ae680d9f 100644 --- a/components/ws-manager/pkg/manager/testdata/status_failedWorkspaceMount_PENDING00.golden +++ b/components/ws-manager/pkg/manager/testdata/status_failedWorkspaceMount_PENDING00.golden @@ -16,7 +16,6 @@ }, "phase": 1, "conditions": { - "failed": "cannot mount workspace", "volume_snapshot": {} }, "message": "pod is pending",