-
Notifications
You must be signed in to change notification settings - Fork 1.3k
/
Copy pathstatus.go
809 lines (707 loc) · 28.5 KB
/
status.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
// Copyright (c) 2020 Gitpod GmbH. All rights reserved.
// Licensed under the GNU Affero General Public License (AGPL).
// See License-AGPL.txt in the project root for license information.
package manager
import (
"bytes"
"context"
"encoding/json"
"fmt"
"strings"
"time"
"github.com/sirupsen/logrus"
"golang.org/x/xerrors"
"google.golang.org/protobuf/types/known/timestamppb"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"github.com/gitpod-io/gitpod/common-go/kubernetes"
wsk8s "github.com/gitpod-io/gitpod/common-go/kubernetes"
"github.com/gitpod-io/gitpod/common-go/log"
"github.com/gitpod-io/gitpod/common-go/util"
regapi "github.com/gitpod-io/gitpod/registry-facade/api"
"github.com/gitpod-io/gitpod/ws-manager/api"
)
const (
	// containerKilledExitCode is the exit code Kubernetes uses for a container which was killed by the system.
	// We expect such containers to be restarted by Kubernetes if they're supposed to be running.
	// We never deliberately terminate a container like this.
	containerKilledExitCode = 137

	// containerUnknownExitCode is the exit code containerd uses if it cannot determine the cause/exit status of
	// a stopped container.
	containerUnknownExitCode = 255
)
// scheme is the default instance of runtime.Scheme to which types in the Kubernetes API are already registered.
// It is used to search for events related to a workspace pod (see completeWorkspaceObjects).
// Inspired by https://github.com/kubernetes/kubernetes/blob/master/pkg/kubectl/scheme/scheme.go
var scheme = runtime.NewScheme()
// init registers the Kubernetes core schema on the package-level scheme.
func init() {
	if err := corev1.AddToScheme(scheme); err != nil {
		log.WithError(err).Fatal("cannot register Kubernetes core schema - this should never happen")
	}
}
// workspaceObjects contains all Kubernetes objects required to compute the status of a workspace
type workspaceObjects struct {
	// Pod is the workspace pod; it may be nil, in which case most accessors fall back to defaults
	Pod *corev1.Pod `json:"pod"`
	// Events are the Kubernetes events associated with the pod
	Events []corev1.Event `json:"events,omitempty"`
}
// GetOWI produces the owner, workspace, instance triple that we use for tracing and logging.
// Without a pod we have nothing to derive the triple from and return empty fields.
func (wso *workspaceObjects) GetOWI() logrus.Fields {
	if wso.Pod == nil {
		return logrus.Fields{}
	}
	return wsk8s.GetOWIFromObject(&wso.Pod.ObjectMeta)
}
// IsWorkspaceHeadless returns true if the workspace described by these objects is headless,
// i.e. its pod carries the headless label set to "true".
func (wso *workspaceObjects) IsWorkspaceHeadless() bool {
	if wso.Pod == nil {
		return false
	}
	lbl, present := wso.Pod.ObjectMeta.Labels[headlessLabel]
	return present && lbl == "true"
}
// WorkspaceType determines the workspace's type from the pod's type label.
// If the type cannot be determined we default to REGULAR and return an error
// explaining why the determination failed.
func (wso *workspaceObjects) WorkspaceType() (api.WorkspaceType, error) {
	if wso.Pod == nil {
		// we don't know anything about this pod - assume it's a regular pod
		return api.WorkspaceType_REGULAR, xerrors.Errorf("cannot determine pod type")
	}
	meta := &wso.Pod.ObjectMeta

	lbl, hasLabel := meta.Labels[wsk8s.TypeLabel]
	if !hasLabel {
		// pods without a type label predate the label - treat them as regular workspaces
		tpe := api.WorkspaceType_REGULAR
		log.WithFields(wsk8s.GetOWIFromObject(meta)).WithField("workspaceType", tpe).Info("determining type of legacy pod")
		return tpe, nil
	}

	code, known := api.WorkspaceType_value[strings.ToUpper(lbl)]
	if !known {
		// pod has invalid type label - we assume it's a regular pod
		return api.WorkspaceType_REGULAR, xerrors.Errorf("pod has invalid type label: %s", lbl)
	}
	return api.WorkspaceType(code), nil
}
// WorkspaceID returns the ID of the workspace as recorded in the pod's workspace ID
// annotation. ok is false if there is no pod or the annotation is absent.
func (wso *workspaceObjects) WorkspaceID() (id string, ok bool) {
	if wso.Pod == nil {
		return "", false
	}
	id, ok = wso.Pod.Annotations[workspaceIDAnnotation]
	if !ok {
		return "", false
	}
	return id, true
}
// WasEverReady returns true if the workspace has ever been marked ready, i.e. its
// pod does not carry the "never ready" annotation.
func (wso *workspaceObjects) WasEverReady() (res bool) {
	if wso.Pod == nil {
		// We assume the pod was ready by default, even if we have nothing to show for it.
		// The real world has shown that this produces the more favorable failure modes.
		return true
	}
	_, neverReady := wso.Pod.Annotations[workspaceNeverReadyAnnotation]
	return !neverReady
}
// NodeName returns the name of the node this workspace is/was deployed to.
// If this workspace has never been deployed anywhere, NodeName returns an empty string.
// (Fixed: the doc comment previously referred to this function as "HostName".)
func (wso *workspaceObjects) NodeName() string {
	if wso.Pod == nil {
		return ""
	}
	// prefer the live scheduling information from the pod spec ...
	if wso.Pod.Spec.NodeName != "" {
		return wso.Pod.Spec.NodeName
	}
	// ... and fall back to the node name annotation
	if res, ok := wso.Pod.Annotations[nodeNameAnnotation]; ok {
		return res
	}
	return ""
}
// getWorkspaceObjects assembles the set of Kubernetes objects needed to compute the
// status of the given workspace pod.
func (m *Manager) getWorkspaceObjects(ctx context.Context, pod *corev1.Pod) (*workspaceObjects, error) {
	result := &workspaceObjects{Pod: pod}
	if err := m.completeWorkspaceObjects(ctx, result); err != nil {
		return nil, xerrors.Errorf("getWorkspaceObjects: %w", err)
	}
	return result, nil
}
// completeWorkspaceObjects finds the remaining Kubernetes objects based on the pod
// description. Currently that means fetching the pod's events if they are not set yet.
func (m *Manager) completeWorkspaceObjects(ctx context.Context, wso *workspaceObjects) error {
	if wso.Pod == nil {
		return xerrors.Errorf("completeWorkspaceObjects: need either pod or lifecycle independent state")
	}
	if wso.Events != nil {
		// events were supplied already - nothing to do
		return nil
	}

	events, err := m.RawClient.CoreV1().Events(m.Config.Namespace).Search(scheme, wso.Pod)
	if err != nil {
		return xerrors.Errorf("completeWorkspaceObjects: %w", err)
	}
	// copy the items so that we own the slice
	wso.Events = append([]corev1.Event{}, events.Items...)
	return nil
}
// getWorkspaceStatus computes a workspace's API status from its Kubernetes objects.
// It requires a pod and a workspace container to be present; the phase/conditions are
// filled in by extractStatusFromPod.
func (m *Manager) getWorkspaceStatus(wso workspaceObjects) (*api.WorkspaceStatus, error) {
	// BUGFIX: check for a nil pod before anything else - the error message for a missing
	// workspace ID annotation below dereferences wso.Pod.Name, which previously panicked
	// when the pod was absent.
	if wso.Pod == nil {
		return nil, xerrors.Errorf("need pod to compute status")
	}

	id, ok := wso.WorkspaceID()
	if !ok {
		return nil, xerrors.Errorf("cannot get %s annotation from %s", workspaceIDAnnotation, wso.Pod.Name)
	}

	// we have a workspace pod - use that to compute the status from scratch
	workspaceContainer := getContainer(wso.Pod, "workspace")
	if workspaceContainer == nil {
		return nil, xerrors.Errorf("workspace pod for %s is degenerate - does not have workspace container", id)
	}

	wsurl, ok := wso.Pod.Annotations[kubernetes.WorkspaceURLAnnotation]
	if !ok {
		return nil, xerrors.Errorf("pod %s has no %s annotation", wso.Pod.Name, kubernetes.WorkspaceURLAnnotation)
	}

	tpe, err := wso.WorkspaceType()
	if err != nil {
		return nil, err
	}

	// the timeout defaults to the configured regular-workspace timeout, but a custom
	// per-workspace timeout annotation takes precedence
	var timeout string
	if t := m.Config.Timeouts.RegularWorkspace; t > 0 {
		timeout = t.String()
	}
	if v, ok := wso.Pod.Annotations[customTimeoutAnnotation]; ok {
		timeout = v
	}

	var (
		wsImage         = workspaceContainer.Image
		ideImage        string
		supervisorImage string
		ideImagesLayers []string
	)
	// the image spec annotation overrides the image recorded on the container
	if ispec, ok := wso.Pod.Annotations[kubernetes.WorkspaceImageSpecAnnotation]; ok {
		spec, err := regapi.ImageSpecFromBase64(ispec)
		if err != nil {
			return nil, xerrors.Errorf("invalid image spec: %w", err)
		}
		wsImage = spec.BaseRef
		ideImage = spec.IdeRef
		supervisorImage = spec.SupervisorRef
		ideImagesLayers = spec.IdeLayerRef
	}

	ownerToken, ok := wso.Pod.Annotations[kubernetes.OwnerTokenAnnotation]
	if !ok {
		log.WithFields(wso.GetOWI()).Warn("pod has no owner token. is this a legacy pod?")
	}
	admission := api.AdmissionLevel_ADMIT_OWNER_ONLY
	if av, ok := api.AdmissionLevel_value[strings.ToUpper(wso.Pod.Annotations[kubernetes.WorkspaceAdmissionAnnotation])]; ok {
		admission = api.AdmissionLevel(av)
	}

	var volumeSnapshotStatus workspaceVolumeSnapshotStatus
	if rawVolumeSnapshotStatus, ok := wso.Pod.Annotations[pvcWorkspaceVolumeSnapshotAnnotation]; ok {
		if err := json.Unmarshal([]byte(rawVolumeSnapshotStatus), &volumeSnapshotStatus); err != nil {
			return nil, xerrors.Errorf("invalid volume snapshot status: %w", err)
		}
	}

	status := &api.WorkspaceStatus{
		Id:            id,
		StatusVersion: m.clock.Tick(),
		Metadata:      getWorkspaceMetadata(wso.Pod),
		Spec: &api.WorkspaceSpec{
			Headless:           wso.IsWorkspaceHeadless(),
			WorkspaceImage:     wsImage,
			DeprecatedIdeImage: ideImage,
			IdeImage: &api.IDEImage{
				WebRef:        ideImage,
				SupervisorRef: supervisorImage,
			},
			IdeImageLayers: ideImagesLayers,
			Url:            wsurl,
			Type:           tpe,
			Timeout:        timeout,
			Class:          wso.Pod.Labels[workspaceClassLabel],
		},
		Conditions: &api.WorkspaceConditions{
			Snapshot: wso.Pod.Annotations[workspaceSnapshotAnnotation],
			VolumeSnapshot: &api.VolumeSnapshotInfo{
				VolumeSnapshotName:   volumeSnapshotStatus.VolumeSnapshotName,
				VolumeSnapshotHandle: volumeSnapshotStatus.VolumeSnapshotHandle,
			},
		},
		Runtime: &api.WorkspaceRuntimeInfo{
			NodeName: wso.Pod.Spec.NodeName,
			PodName:  wso.Pod.Name,
			NodeIp:   wso.Pod.Status.HostIP,
		},
		Auth: &api.WorkspaceAuthentication{
			Admission:  admission,
			OwnerToken: ownerToken,
		},
	}

	err = m.extractStatusFromPod(status, wso)
	if err != nil {
		return nil, xerrors.Errorf("cannot get workspace status: %w", err)
	}

	return status, nil
}
// getContainer returns the container with the given name from the pod's spec, or nil
// if no such container exists. The returned pointer refers to the element within
// pod.Spec.Containers (previously it pointed at a loop-variable copy, so mutations
// through the pointer were silently lost and each iteration copied the whole struct).
func getContainer(pod *corev1.Pod, name string) *corev1.Container {
	for i := range pod.Spec.Containers {
		if pod.Spec.Containers[i].Name == name {
			return &pod.Spec.Containers[i]
		}
	}
	return nil
}
// getWorkspaceMetadata extracts a workspace's metadata from pod labels and annotations.
// Only annotations carrying the workspace annotation prefix are surfaced (with the
// prefix stripped); everything else is internal bookkeeping.
func getWorkspaceMetadata(pod *corev1.Pod) *api.WorkspaceMetadata {
	annotations := make(map[string]string)
	for key, value := range pod.Annotations {
		if strings.HasPrefix(key, workspaceAnnotationPrefix) {
			annotations[strings.TrimPrefix(key, workspaceAnnotationPrefix)] = value
		}
	}

	return &api.WorkspaceMetadata{
		Owner:       pod.ObjectMeta.Labels[wsk8s.OwnerLabel],
		MetaId:      pod.ObjectMeta.Labels[wsk8s.MetaIDLabel],
		StartedAt:   timestamppb.New(pod.CreationTimestamp.Time),
		Annotations: annotations,
	}
}
// extractStatusFromPod fills in result's phase, conditions, message and repo status
// from the state of the workspace pod. It mutates result in place and expects
// result.Spec and result.Conditions to be non-nil (as prepared by getWorkspaceStatus).
// The ordering of checks below matters: explicit failure phases, then deletion,
// then the Kubernetes pod phase.
func (m *Manager) extractStatusFromPod(result *api.WorkspaceStatus, wso workspaceObjects) error {
	pod := wso.Pod

	result.Spec.ExposedPorts = extractExposedPorts(pod).Ports

	// check failure states, i.e. determine value of result.Failed
	failure, phase := extractFailure(wso, m.metrics)
	result.Conditions.Failed = failure
	if phase != nil {
		// extractFailure was confident about the phase - nothing left to decide
		result.Phase = *phase
		return nil
	}
	if reason, timedout := pod.Annotations[workspaceTimedOutAnnotation]; timedout {
		if reason == "" {
			reason = "workspace timed out for an unknown reason"
		}
		result.Conditions.Timeout = reason
	}
	if _, sbr := pod.Annotations[stoppedByRequestAnnotation]; sbr {
		result.Conditions.StoppedByRequest = api.WorkspaceConditionBool_TRUE
	}
	if _, abr := pod.Annotations[abortRequestAnnotation]; abr {
		result.Conditions.Aborted = api.WorkspaceConditionBool_TRUE
	}
	if wso.IsWorkspaceHeadless() {
		// headless task failures surface as container termination messages
		for _, cs := range pod.Status.ContainerStatuses {
			if cs.State.Terminated != nil && cs.State.Terminated.Message != "" {
				result.Conditions.HeadlessTaskFailed = cs.State.Terminated.Message
				break
			}
		}
	}

	// if ws-manager is still attempting to create workspace pod, keep status in pending
	// pod might get deleted several times if we cannot schedule it on the node
	if _, atc := pod.Annotations[attemptingToCreatePodAnnotation]; atc {
		result.Phase = api.WorkspacePhase_PENDING
		return nil
	}

	if isPodBeingDeleted(pod) {
		result.Phase = api.WorkspacePhase_STOPPING

		var hasFinalizer bool
		for _, f := range wso.Pod.Finalizers {
			if f == gitpodFinalizerName {
				hasFinalizer = true
				break
			}
		}
		if !hasFinalizer {
			// We do this independently of the disposal status because pods only get their finalizer
			// once they're running. If they fail before they reach the running phase we'll never see
			// a disposal status, hence would never stop the workspace.
			result.Phase = api.WorkspacePhase_STOPPED
		}

		if rawDisposalStatus, ok := pod.Annotations[disposalStatusAnnotation]; ok {
			var ds workspaceDisposalStatus
			err := json.Unmarshal([]byte(rawDisposalStatus), &ds)
			if err != nil {
				return err
			}

			if ds.Status.IsDisposed() {
				if ds.BackupFailure == "" {
					result.Conditions.FinalBackupComplete = api.WorkspaceConditionBool_TRUE
				}

				// Finalizer or not - once the backup is complete we consider the pod stopped.
				// Once the finalizer is removed, there's no guarantee we see the pod again, because it might be
				// deleted too quickly for us to handle. Hence, we consider the workspace stopped once the backup
				// is done, even though the finalizer might still be present.
				//
				// This runs the risk that a pod might still be present, but is considered stopped.
				result.Phase = api.WorkspacePhase_STOPPED
			}
			result.Repo = ds.GitStatus

			// if the final backup has failed we need to tell the world (if we haven't done so already)
			if ds.BackupFailure != "" && !strings.Contains(result.Conditions.Failed, "last backup failed") {
				if result.Conditions.Failed != "" {
					result.Conditions.Failed += "; "
				}
				result.Conditions.Failed += fmt.Sprintf("last backup failed: %s.", ds.BackupFailure)
			}
		}

		return nil
	}

	status := pod.Status
	if status.Phase == corev1.PodPending {
		// check if any container is still pulling images
		for _, cs := range status.ContainerStatuses {
			if cs.State.Waiting != nil {
				if cs.State.Waiting.Reason != "ContainerCreating" {
					continue
				}

				if wso.WasEverReady() {
					// Workspace was ready at some point but has become unready. This should never happen due to the
					// very conservative readiness probe. If it does happen however, we don't want to fall back to a
					// creating phase.
					log.WithField("pod", pod).Warn("once ready pod became unready - this should not happen")
					continue
				}

				result.Phase = api.WorkspacePhase_CREATING
				result.Conditions.PullingImages = api.WorkspaceConditionBool_TRUE
				result.Message = "containers are being created"
				return nil
			}
		}

		result.Phase = api.WorkspacePhase_PENDING
		result.Message = "pod is pending"
		return nil
	} else if status.Phase == corev1.PodRunning {
		if firstUserActivity, ok := wso.Pod.Annotations[firstUserActivityAnnotation]; ok {
			t, err := time.Parse(time.RFC3339Nano, firstUserActivity)
			if err != nil {
				return xerrors.Errorf("cannot parse firstUserActivity: %w", err)
			}
			pt := timestamppb.New(t)
			result.Conditions.FirstUserActivity = pt
		}

		for _, cs := range status.ContainerStatuses {
			// containers that were terminated are not ready, but may have been
			if cs.State.Terminated != nil && cs.State.Terminated.ExitCode == containerUnknownExitCode {
				// the container was stopped for an unknown reason.
				// this means that the workspace is currently interrupted.
				result.Phase = api.WorkspacePhase_INTERRUPTED

				// was it caused by a network outage?
				if hasNetworkNotReadyEvent(wso) {
					// this might be a false positive in case the workspace recovers but has an exit code 255 for
					// another reason afterwards. As of now we're lacking the data to handle it better.
					result.Conditions.NetworkNotReady = api.WorkspaceConditionBool_TRUE
					result.Message = "container network not ready - workspace should recover shortly"
				} else {
					result.Message = fmt.Sprintf("container %s was terminated unexpectedly - workspace should recover", cs.Name)
				}

				return nil
			}

			if cs.State.Terminated != nil && cs.State.Terminated.ExitCode == containerKilledExitCode {
				// we have a container which was killed with exit code 137 recently and is (hopefully) restarting at the moment.
				// this means that the workspace is currently interrupted.
				result.Phase = api.WorkspacePhase_INTERRUPTED
				result.Message = fmt.Sprintf("container %s was terminated unexpectedly - workspace should recover", cs.Name)
				return nil
			}

			if !cs.Ready && cs.RestartCount > 0 {
				// this container was running before, but is currently recovering from an interruption
				result.Phase = api.WorkspacePhase_INTERRUPTED
				result.Message = fmt.Sprintf("container %s was terminated unexpectedly - workspace is recovering", cs.Name)
				return nil
			}
		}

		tpe, err := wso.WorkspaceType()
		if err != nil {
			log.WithError(err).Warn("cannot determine workspace type - assuming this is a regular")
			tpe = api.WorkspaceType_REGULAR
		}

		if wso.IsWorkspaceHeadless() && tpe != api.WorkspaceType_PREBUILD {
			// headless workspaces (except prebuilds) don't expose a public service and thus cannot be asked about their status.
			// once kubernetes reports the workspace running, so do we.
			result.Phase = api.WorkspacePhase_RUNNING
			return nil
		}

		if _, neverReady := pod.Annotations[workspaceNeverReadyAnnotation]; !neverReady {
			// workspace has been marked ready by a workspace-ready probe of the monitor
			result.Phase = api.WorkspacePhase_RUNNING
			return nil
		}

		// workspace has not yet been marked ready by one of monitor's probes. It must be initializing then.
		result.Phase = api.WorkspacePhase_INITIALIZING
		result.Message = "workspace initializer is running"
		return nil
	} else if isCompletedHeadless(&wso) {
		result.Phase = api.WorkspacePhase_STOPPING
		result.Message = "headless workspace is stopping"
		return nil
	} else if status.Phase == corev1.PodSucceeded {
		result.Phase = api.WorkspacePhase_STOPPING
		result.Message = "workspace is stopping"
		return nil
	} else if status.Phase == corev1.PodUnknown {
		result.Phase = api.WorkspacePhase_UNKNOWN
		result.Message = "Kubernetes reports workspace phase as unknown"
		return nil
	}

	// If we've extracted a failure reason earlier, but no explicit phase at the time, as well as in this
	// function, we resort to unknown.
	// This is different to being unable to determine the workspace phase - the phase is unknown due to an unknown failure (by definition).
	if failure != "" {
		result.Phase = api.WorkspacePhase_UNKNOWN
		return nil
	}

	log.WithFields(wsk8s.GetOWIFromObject(&pod.ObjectMeta)).WithField("pod", pod).Error("cannot determine workspace phase")
	result.Phase = api.WorkspacePhase_UNKNOWN
	result.Message = "cannot determine workspace phase. We should never get here."
	return nil
}
// extractFailure returns a pod failure reason and possibly a phase. If phase is nil then
// one should extract the phase themselves. If the pod has not failed, this function returns "", nil.
// metrics may be nil; if present it is used to count unintentional workspace stops.
func extractFailure(wso workspaceObjects, metrics *metrics) (string, *api.WorkspacePhase) {
	pod := wso.Pod

	// if the workspace was explicitly marked as failed that also constitutes a failure reason
	reason, explicitFailure := pod.Annotations[workspaceExplicitFailAnnotation]
	if explicitFailure {
		return reason, nil
	}

	status := pod.Status
	if status.Phase == corev1.PodFailed && (status.Reason != "" || status.Message != "") {
		// Don't force the phase to UNKNOWN here to leave a chance that we may detect the actual phase of
		// the workspace, e.g. stopping.
		return fmt.Sprintf("%s: %s", status.Reason, status.Message), nil
	}

	for _, cs := range status.ContainerStatuses {
		if cs.State.Waiting != nil {
			if cs.State.Waiting.Reason == "ImagePullBackOff" || cs.State.Waiting.Reason == "ErrImagePull" {
				// If the image pull failed we were definitely in the api.WorkspacePhase_CREATING phase,
				// unless of course this pod has been deleted already.
				var res *api.WorkspacePhase
				if isPodBeingDeleted(pod) {
					// The pod is being deleted already and we have to decide the phase based on the presence of the
					// finalizer and disposal status annotation. That code already exists in the remainder of getStatus,
					// hence we defer the decision.
					res = nil
				} else {
					c := api.WorkspacePhase_CREATING
					res = &c
				}
				return fmt.Sprintf("cannot pull image: %s", cs.State.Waiting.Message), res
			}
		}

		// consider the current termination state; if the container already restarted,
		// fall back to the last termination state
		terminationState := cs.State.Terminated
		if terminationState == nil {
			terminationState = cs.LastTerminationState.Terminated
		}
		if terminationState != nil {
			// a workspace terminated container is not necessarily bad. During shutdown workspaces containers
			// can go in this state and that's ok. However, if the workspace was shutting down due to deletion,
			// we would not be here as we've checked for a DeletionTimestamp prior. So let's find out why the
			// container is terminating.
			if terminationState.ExitCode != 0 && terminationState.Message != "" {
				var phase *api.WorkspacePhase
				if !isPodBeingDeleted(pod) {
					// If the container wrote a termination message and is not currently being deleted,
					// then it must have been/be running. If we did not force the phase here,
					// we'd be in unknown.
					c := api.WorkspacePhase_RUNNING
					phase = &c
				}

				// the container itself told us why it was terminated - use that as failure reason
				return extractFailureFromLogs([]byte(terminationState.Message)), phase
			} else if terminationState.Reason == "Error" {
				if !isPodBeingDeleted(pod) && terminationState.ExitCode != containerKilledExitCode {
					phase := api.WorkspacePhase_RUNNING
					return fmt.Sprintf("container %s ran with an error: exit code %d", cs.Name, terminationState.ExitCode), &phase
				}
			} else if terminationState.Reason == "Completed" {
				// container terminated successfully - this is not a failure
				if !isPodBeingDeleted(pod) {
					wsType := strings.ToUpper(pod.Labels[wsk8s.TypeLabel])
					wsClass := pod.Labels[workspaceClassLabel]
					if metrics != nil && !wso.IsWorkspaceHeadless() {
						// a non-headless container should not complete on its own - count it as an unintentional stop
						metrics.totalUnintentionalWorkspaceStopCounterVec.WithLabelValues(wsType, wsClass).Inc()
					}
				}
				return "", nil
			} else if !isPodBeingDeleted(pod) && terminationState.ExitCode != containerUnknownExitCode {
				// if a container is terminated and it wasn't because of either:
				//   - regular shutdown
				//   - the exit code "UNKNOWN" (which might be caused by an intermittent issue and is handled in extractStatusFromPod)
				//   - another known error
				// then we report it as UNKNOWN
				res := api.WorkspacePhase_UNKNOWN
				return fmt.Sprintf("workspace container %s terminated for an unknown reason: (%s) %s", cs.Name, terminationState.Reason, terminationState.Message), &res
			}
		}
	}

	// some failure conditions are only evident from the "events" that kubernetes stores, e.g. events coming from the kubelet
	for _, evt := range wso.Events {
		if evt.Reason != "FailedMount" {
			continue
		}

		if strings.Contains(evt.Message, "MountVolume.MountDevice failed for volume") {
			// ref: https://github.com/gitpod-io/gitpod/issues/13353
			// ref: https://github.com/kubernetes-sigs/gcp-compute-persistent-disk-csi-driver/issues/608
			return "", nil
		} else if strings.Contains(evt.Message, workspaceVolumeName) {
			// ideally we do not just use evt.Message as failure reason because it contains internal paths and is not useful for the user
			return "cannot mount workspace", nil
		} else {
			// if this happens we did not do a good job because that means we've introduced another volume to the pod
			// but did not consider that mounting it might fail.
			return evt.Message, nil
		}
	}

	return "", nil
}
// extractFailureFromLogs attempts to extract the last error message from a workspace
// container's log output. The logs are expected to contain JSON lines with "message"
// and optionally "error" fields; the last such line wins. If no parsable line with a
// message is found, the raw log output is returned verbatim.
//
// Fixed: the previous implementation started at the last newline, so the trailing
// segment (and single-line logs without a trailing newline) was never examined; it
// also reused the decode target across lines, letting stale fields leak between lines.
func extractFailureFromLogs(logs []byte) string {
	lines := bytes.Split(logs, []byte("\n"))

	// walk the lines back to front - we're interested in the last error message
	for i := len(lines) - 1; i >= 0; i-- {
		// fresh target per line so a line without a "message" field cannot see stale data
		var msg struct {
			Error   string `json:"error"`
			Message string `json:"message"`
		}
		if err := json.Unmarshal(lines[i], &msg); err != nil {
			continue
		}
		if msg.Message == "" {
			continue
		}
		if msg.Error == "" {
			return msg.Message
		}
		return msg.Message + ": " + msg.Error
	}

	return string(logs)
}
// isPodBeingDeleted returns true if the pod is currently being deleted.
// A set deletionTimestamp is the only marker Kubernetes gives us for a deletion in progress.
func isPodBeingDeleted(pod *corev1.Pod) bool {
	return pod.ObjectMeta.DeletionTimestamp != nil
}
// isCompletedHeadless returns true if the pod belongs to a headless workspace and has
// run to completion, i.e. Kubernetes reports it as succeeded or failed.
func isCompletedHeadless(wso *workspaceObjects) bool {
	if !wso.IsWorkspaceHeadless() {
		return false
	}
	phase := wso.Pod.Status.Phase
	return phase == corev1.PodSucceeded || phase == corev1.PodFailed
}
// activity names the kind of workspace activity a timeout refers to. It is used to
// produce human-readable timeout reasons (see isWorkspaceTimedOut).
type activity string

const (
	activityInit               activity = "initialization"
	activityStartup            activity = "startup"
	activityCreatingContainers activity = "creating containers"
	activityPullingImages      activity = "pulling images"
	activityRunningHeadless    activity = "running the headless workspace"
	activityNone               activity = "period of inactivity"
	activityMaxLifetime        activity = "maximum lifetime"
	activityClosed             activity = "after being closed"
	activityInterrupted        activity = "workspace interruption"
	activityStopping           activity = "stopping"
	activityBackup             activity = "backup"
)
// isWorkspaceTimedOut determines if a workspace is timed out based on the manager configuration and state the pod is in.
// This function does NOT use the workspaceTimedoutAnnotation, but rather is used to set that annotation in the first place.
// It returns a non-empty, human-readable reason if the workspace has timed out, and an empty string otherwise.
func (m *Manager) isWorkspaceTimedOut(wso workspaceObjects) (reason string, err error) {
	workspaceID, ok := wso.WorkspaceID()
	if !ok {
		return "", xerrors.Errorf("workspace has no %s annotation", workspaceIDAnnotation)
	}

	status, err := m.getWorkspaceStatus(wso)
	if err != nil {
		return "", xerrors.Errorf("cannot determine workspace phase: %w", err)
	}
	phase := status.Phase

	// decide produces a timeout reason iff at least `timeout` has passed since `start`
	decide := func(start time.Time, timeout util.Duration, activity activity) (string, error) {
		td := time.Duration(timeout)
		inactivity := time.Since(start)
		if inactivity < td {
			return "", nil
		}

		return fmt.Sprintf("workspace timed out after %s (%s) took longer than %s", activity, formatDuration(inactivity), formatDuration(td)), nil
	}

	start := wso.Pod.ObjectMeta.CreationTimestamp.Time
	// lastActivity is nil if the user has never produced any activity on this workspace
	lastActivity := m.getWorkspaceActivity(workspaceID)
	_, isClosed := wso.Pod.Annotations[workspaceClosedAnnotation]

	switch phase {
	case api.WorkspacePhase_PENDING:
		return decide(start, m.Config.Timeouts.Initialization, activityInit)

	case api.WorkspacePhase_INITIALIZING:
		return decide(start, m.Config.Timeouts.TotalStartup, activityStartup)

	case api.WorkspacePhase_CREATING:
		activity := activityCreatingContainers
		if status.Conditions.PullingImages == api.WorkspaceConditionBool_TRUE {
			activity = activityPullingImages
		}
		return decide(start, m.Config.Timeouts.TotalStartup, activity)

	case api.WorkspacePhase_RUNNING:
		// First check is always for the max lifetime
		if msg, err := decide(start, m.Config.Timeouts.MaxLifetime, activityMaxLifetime); msg != "" {
			return msg, err
		}

		timeout := m.Config.Timeouts.RegularWorkspace
		activity := activityNone
		if wso.IsWorkspaceHeadless() {
			// headless workspaces have no user activity - count their timeout from pod creation
			timeout = m.Config.Timeouts.HeadlessWorkspace
			lastActivity = &start
			activity = activityRunningHeadless
		} else if lastActivity == nil {
			// the workspace is up and running, but the user has never produced any activity
			return decide(start, m.Config.Timeouts.TotalStartup, activityNone)
		} else if isClosed {
			return decide(*lastActivity, m.Config.Timeouts.AfterClose, activityClosed)
		}
		// a per-workspace custom timeout annotation overrides the configured timeout if it parses
		if ctv, ok := wso.Pod.Annotations[customTimeoutAnnotation]; ok {
			if ct, err := time.ParseDuration(ctv); err == nil {
				timeout = util.Duration(ct)
			} else {
				log.WithError(err).WithField("customTimeout", ctv).WithFields(wsk8s.GetOWIFromObject(&wso.Pod.ObjectMeta)).Warn("pod had custom timeout annotation set, but could not parse its value. Defaulting to ws-manager config.")
			}
		}
		return decide(*lastActivity, timeout, activity)

	case api.WorkspacePhase_INTERRUPTED:
		if lastActivity == nil {
			// the workspace is up and running, but the user has never produced any activity
			return decide(start, m.Config.Timeouts.Interrupted, activityInterrupted)
		}
		return decide(*lastActivity, m.Config.Timeouts.Interrupted, activityInterrupted)

	case api.WorkspacePhase_STOPPING:
		if isPodBeingDeleted(wso.Pod) && status.Conditions.FinalBackupComplete != api.WorkspaceConditionBool_TRUE {
			// Beware: we apply the ContentFinalization timeout only to workspaces which are currently being deleted.
			//         We basically don't expect a workspace to be in content finalization before it's been deleted.
			return decide(wso.Pod.DeletionTimestamp.Time, m.Config.Timeouts.ContentFinalization, activityBackup)
		} else if !isPodBeingDeleted(wso.Pod) {
			// pods that have not been deleted have never timed out
			return "", nil
		} else {
			return decide(wso.Pod.DeletionTimestamp.Time, m.Config.Timeouts.Stopping, activityStopping)
		}

	default:
		// the only other phases we can be in is stopped which is pointless to time out
		return "", nil
	}
}
// hasNetworkNotReadyEvent determines if a workspace experienced a network outage - now,
// or any time in the past - based on its Kubernetes events.
func hasNetworkNotReadyEvent(wso workspaceObjects) bool {
	for i := range wso.Events {
		if wso.Events[i].Reason == "NetworkNotReady" {
			return true
		}
	}
	return false
}
func formatDuration(d time.Duration) string {
d = d.Round(time.Minute)
h := d / time.Hour
d -= h * time.Hour
m := d / time.Minute
return fmt.Sprintf("%02dh%02dm", h, m)
}