Skip to content

Commit

Permalink
Add counter metrics to record error count
Browse files Browse the repository at this point in the history
Add two metrics to record the `mount device failed` and `cannot mount volume`
errors, so that we can track how frequently these errors occur.

Signed-off-by: JenTing Hsiao <[email protected]>
  • Loading branch information
jenting authored and roboquat committed Nov 3, 2022
1 parent 0a130f1 commit d686ea9
Show file tree
Hide file tree
Showing 4 changed files with 28 additions and 26 deletions.
16 changes: 16 additions & 0 deletions components/ws-manager/pkg/manager/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,8 @@ type metrics struct {
totalRestoreCounterVec *prometheus.CounterVec
totalRestoreFailureCounterVec *prometheus.CounterVec
totalUnintentionalWorkspaceStopCounterVec *prometheus.CounterVec
totalMountDeviceFailedVec *prometheus.CounterVec
totalCannotMountVolumeVec *prometheus.CounterVec

// Gauge
totalOpenPortGauge prometheus.GaugeFunc
Expand Down Expand Up @@ -142,6 +144,18 @@ func newMetrics(m *Manager) *metrics {
Name: "workspace_unintentional_stop_total",
Help: "total number of workspaces when container stopped without being deleted prior",
}, []string{"type", "class"}),
totalMountDeviceFailedVec: prometheus.NewCounterVec(prometheus.CounterOpts{
Namespace: metricsNamespace,
Subsystem: metricsWorkspaceSubsystem,
Name: "workspace_mount_device_failed",
Help: "total number of workspace mount device failed",
}, []string{"type", "class"}),
totalCannotMountVolumeVec: prometheus.NewCounterVec(prometheus.CounterOpts{
Namespace: metricsNamespace,
Subsystem: metricsWorkspaceSubsystem,
Name: "workspace_cannot_mount_volume",
Help: "total number of workspace cannot mount volume",
}, []string{"type", "class"}),
totalOpenPortGauge: prometheus.NewGaugeFunc(prometheus.GaugeOpts{
Namespace: metricsNamespace,
Subsystem: metricsWorkspaceSubsystem,
Expand Down Expand Up @@ -205,6 +219,8 @@ func (m *metrics) Register(reg prometheus.Registerer) error {
m.totalRestoreCounterVec,
m.totalRestoreFailureCounterVec,
m.totalUnintentionalWorkspaceStopCounterVec,
m.totalMountDeviceFailedVec,
m.totalCannotMountVolumeVec,
m.totalOpenPortGauge,
}
for _, c := range collectors {
Expand Down
12 changes: 10 additions & 2 deletions components/ws-manager/pkg/manager/status.go
Original file line number Diff line number Diff line change
Expand Up @@ -528,6 +528,8 @@ func (m *Manager) extractStatusFromPod(result *api.WorkspaceStatus, wso workspac
// one should extract the phase themselves. If the pod has not failed, this function returns "", nil.
func extractFailure(wso workspaceObjects, metrics *metrics) (string, *api.WorkspacePhase) {
pod := wso.Pod
wsType := strings.ToUpper(pod.Labels[wsk8s.TypeLabel])
wsClass := pod.Labels[workspaceClassLabel]

// if the workspace was explicitly marked as failed that also constitutes a failure reason
reason, explicitFailure := pod.Annotations[workspaceExplicitFailAnnotation]
Expand Down Expand Up @@ -590,8 +592,6 @@ func extractFailure(wso workspaceObjects, metrics *metrics) (string, *api.Worksp
} else if terminationState.Reason == "Completed" {
// container terminated successfully - this is not a failure
if !isPodBeingDeleted(pod) {
wsType := strings.ToUpper(pod.Labels[wsk8s.TypeLabel])
wsClass := pod.Labels[workspaceClassLabel]
if metrics != nil && !wso.IsWorkspaceHeadless() {
metrics.totalUnintentionalWorkspaceStopCounterVec.WithLabelValues(wsType, wsClass).Inc()
}
Expand All @@ -618,9 +618,17 @@ func extractFailure(wso workspaceObjects, metrics *metrics) (string, *api.Worksp
if strings.Contains(evt.Message, "MountVolume.MountDevice failed for volume") {
// ref: https://github.com/gitpod-io/gitpod/issues/13353
// ref: https://github.com/kubernetes-sigs/gcp-compute-persistent-disk-csi-driver/issues/608
log.WithField("pod", pod.Name).Warnf("%s", evt.Message)
if metrics != nil {
metrics.totalMountDeviceFailedVec.WithLabelValues(wsType, wsClass).Inc()
}
return "", nil
} else if strings.Contains(evt.Message, workspaceVolumeName) {
// ref: https://github.com/gitpod-io/gitpod/issues/14032
log.WithField("pod", pod.Name).Warnf("%s", evt.Message)
if metrics != nil {
metrics.totalCannotMountVolumeVec.WithLabelValues(wsType, wsClass).Inc()
}
return "", nil
} else {
// if this happens we did not do a good job because that means we've introduced another volume to the pod
Expand Down
Original file line number Diff line number Diff line change
@@ -1,24 +1,3 @@
{
"actions": [
{
"Func": "markWorkspace",
"Params": {
"annotations": [
{
"Name": "gitpod/failedBeforeStopping",
"Value": "true",
"Delete": false
}
],
"workspaceID": "b3242d9b-6920-41b5-8e72-c3d5637ca148"
}
},
{
"Func": "stopWorkspace",
"Params": {
"gracePeriod": 30000000000,
"workspaceID": "b3242d9b-6920-41b5-8e72-c3d5637ca148"
}
}
]
}
"actions": null
}
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
},
"phase": 1,
"conditions": {
"failed": "cannot mount workspace",
"volume_snapshot": {}
},
"message": "pod is pending",
Expand Down

0 comments on commit d686ea9

Please sign in to comment.