From b9e2cd342e9dcfac317e6b921a7744e480122c9b Mon Sep 17 00:00:00 2001 From: odubajDT <93584209+odubajDT@users.noreply.github.com> Date: Mon, 4 Mar 2024 15:04:37 +0100 Subject: [PATCH] feat(lifecycle-operator): adapt WorkloadVersionReconciler logic to use ObservabilityTimeout for workload deployment (#3160) Signed-off-by: odubajDT Signed-off-by: vickysomtee --- Makefile | 1 + .../lifecycle-operator/deployment-flow.md | 11 ++++ .../lifecycle-operator/keptn-apps.md | 11 ++++ docs/docs/getting-started/observability.md | 1 + docs/docs/guides/otel.md | 14 +++++ .../v1beta1/keptnworkloadversion_types.go | 8 +++ .../keptnworkloadversion_types_test.go | 3 ++ .../keptnworkloadversion/controller_test.go | 54 +++++++++++++++++++ .../reconcile_deploymentstate.go | 29 +++++++++- .../verify-keptnconfig.sh | 0 .../chainsaw-test.yaml | 4 +- .../timeout-failure-deployment/00-assert.yaml | 50 +++++++++++++++++ .../00-install.yaml | 28 ++++++++++ .../chainsaw-test.yaml | 31 +++++++++++ .../keptnconfig-default.yaml | 6 +++ .../keptnconfig.yaml | 6 +++ 16 files changed, 253 insertions(+), 4 deletions(-) rename test/chainsaw/{non-blocking-deployment => common}/verify-keptnconfig.sh (100%) create mode 100644 test/chainsaw/timeout-failure-deployment/00-assert.yaml create mode 100644 test/chainsaw/timeout-failure-deployment/00-install.yaml create mode 100755 test/chainsaw/timeout-failure-deployment/chainsaw-test.yaml create mode 100644 test/chainsaw/timeout-failure-deployment/keptnconfig-default.yaml create mode 100644 test/chainsaw/timeout-failure-deployment/keptnconfig.yaml diff --git a/Makefile b/Makefile index c667f5f2f6f..6e88abaaba8 100644 --- a/Makefile +++ b/Makefile @@ -34,6 +34,7 @@ integration-test: chainsaw test --test-dir ./test/chainsaw/testanalysis/ chainsaw test --test-dir ./test/chainsaw/testcertificate/ chainsaw test --test-dir ./test/chainsaw/non-blocking-deployment/ + chainsaw test --test-dir ./test/chainsaw/timeout-failure-deployment/ .PHONY: integration-test-local #these tests should run on a real cluster! integration-test-local: diff --git a/docs/docs/components/lifecycle-operator/deployment-flow.md b/docs/docs/components/lifecycle-operator/deployment-flow.md index 8788e9b4e70..4759b00360e 100644 --- a/docs/docs/components/lifecycle-operator/deployment-flow.md +++ b/docs/docs/components/lifecycle-operator/deployment-flow.md @@ -122,6 +122,17 @@ If any of these activities fail, the `KeptnApp` issues the `AppDeployErrored` event and terminates the deployment. +> **Note** +By default Keptn observes the state of the Kubernetes workloads +for 5 minutes. +After this timeout is exceeded, the deployment phase (from Keptn +viewpoint) is considered as `Failed` and Keptn does not proceed +with post-deployment phases (tasks, evaluations or promotion phase). +This timeout can be modified for the cluster by changing the value +of the `observabilityTimeout` field in the +[KeptnConfig](../../reference/crd-reference/config.md) +resource. + ```shell AppDeploy AppDeployStarted diff --git a/docs/docs/components/lifecycle-operator/keptn-apps.md b/docs/docs/components/lifecycle-operator/keptn-apps.md index 266737f48f0..2c6f6ac01b4 100644 --- a/docs/docs/components/lifecycle-operator/keptn-apps.md +++ b/docs/docs/components/lifecycle-operator/keptn-apps.md @@ -28,6 +28,17 @@ The `KeptnWorkload` resources are created automatically and without delay by the mutating webhook as soon as the workload manifest is applied. +> **Note** +By default Keptn observes the state of the Kubernetes workloads +for 5 minutes. +After this timeout is exceeded, the deployment phase (from Keptn +viewpoint) is considered as `Failed` and Keptn does not proceed +with post-deployment phases (tasks, evaluations or promotion phase). +This timeout can be modified for the cluster by changing the value +of the `observabilityTimeout` field in the +[KeptnConfig](../../reference/crd-reference/config.md) +resource. + ## Keptn Applications A [KeptnApp](../../reference/crd-reference/app.md) diff --git a/docs/docs/getting-started/observability.md b/docs/docs/getting-started/observability.md index f8a9aa86c32..6fa975082ef 100644 --- a/docs/docs/getting-started/observability.md +++ b/docs/docs/getting-started/observability.md @@ -74,6 +74,7 @@ metadata: spec: OTelCollectorUrl: 'jaeger-collector.keptn-system.svc.cluster.local:4317' keptnAppCreationRequestTimeoutSeconds: 30 + observabilityTimeout: 5m ``` Apply the file and wait for Keptn to pick up the new configuration: diff --git a/docs/docs/guides/otel.md b/docs/docs/guides/otel.md index 4225ec5449a..b469de55c4a 100644 --- a/docs/docs/guides/otel.md +++ b/docs/docs/guides/otel.md @@ -161,6 +161,20 @@ kubectl port-forward deployment/metrics-operator 9999 -n keptn-system You can access the metrics from your browser at: `http://localhost:9999` +## Define timeout for workload observability + +There are situations when the deployment of the application fails due to +various reasons (e.g. container image not found). +By default Keptn observes the state of the Kubernetes workloads +for 5 minutes. +After this timeout is exceeded, the deployment phase (from Keptn +viewpoint) is considered as `Failed` and Keptn does not proceed +with post-deployment phases (tasks, evaluations or promotion phase). +This timeout can be modified for the cluster by changing the value +of the `observabilityTimeout` field in the +[KeptnConfig](../reference/crd-reference/config.md) +resource. + ## Advanced tracing configurations in Keptn: Linking traces In Keptn you can connect multiple traces, for instance to connect deployments diff --git a/lifecycle-operator/apis/lifecycle/v1beta1/keptnworkloadversion_types.go b/lifecycle-operator/apis/lifecycle/v1beta1/keptnworkloadversion_types.go index 5f32491c53d..a8213a3d6fe 100644 --- a/lifecycle-operator/apis/lifecycle/v1beta1/keptnworkloadversion_types.go +++ b/lifecycle-operator/apis/lifecycle/v1beta1/keptnworkloadversion_types.go @@ -521,3 +521,11 @@ func (w KeptnWorkloadVersion) GetEventAnnotations() map[string]string { "workloadVersionName": w.Name, } } + +func (w *KeptnWorkloadVersion) SetDeploymentStartTime() { + w.Status.DeploymentStartTime = metav1.NewTime(time.Now().UTC()) +} + +func (w *KeptnWorkloadVersion) IsDeploymentStartTimeSet() bool { + return !w.Status.DeploymentStartTime.IsZero() +} diff --git a/lifecycle-operator/apis/lifecycle/v1beta1/keptnworkloadversion_types_test.go b/lifecycle-operator/apis/lifecycle/v1beta1/keptnworkloadversion_types_test.go index f3f7f4d5a1c..50029e38ba6 100644 --- a/lifecycle-operator/apis/lifecycle/v1beta1/keptnworkloadversion_types_test.go +++ b/lifecycle-operator/apis/lifecycle/v1beta1/keptnworkloadversion_types_test.go @@ -113,12 +113,15 @@ func TestKeptnWorkloadVersion(t *testing.T) { require.False(t, workload.IsEndTimeSet()) require.False(t, workload.IsStartTimeSet()) + require.False(t, workload.IsDeploymentStartTimeSet()) workload.SetStartTime() workload.SetEndTime() + workload.SetDeploymentStartTime() require.True(t, workload.IsEndTimeSet()) require.True(t, workload.IsStartTimeSet()) + require.True(t, workload.IsDeploymentStartTimeSet()) require.Equal(t, []attribute.KeyValue{ common.AppName.String("appname"), diff --git a/lifecycle-operator/controllers/lifecycle/keptnworkloadversion/controller_test.go b/lifecycle-operator/controllers/lifecycle/keptnworkloadversion/controller_test.go index fcb37bf0d34..9aa76f10e1a 100644 --- a/lifecycle-operator/controllers/lifecycle/keptnworkloadversion/controller_test.go +++ b/lifecycle-operator/controllers/lifecycle/keptnworkloadversion/controller_test.go @@ -53,6 +53,7 @@ func TestKeptnWorkloadVersionReconciler_reconcileDeployment_FailedReplicaSet(t * keptnState, err := r.reconcileDeployment(context.TODO(), workloadVersion) require.Nil(t, err) require.Equal(t, apicommon.StateProgressing, keptnState) + require.False(t, workloadVersion.Status.DeploymentStartTime.IsZero()) } func TestKeptnWorkloadVersionReconciler_reconcileDeployment_UnavailableReplicaSet(t *testing.T) { @@ -71,6 +72,51 @@ func TestKeptnWorkloadVersionReconciler_reconcileDeployment_UnavailableReplicaSe keptnState, err := r.reconcileDeployment(context.TODO(), workloadVersion) require.NotNil(t, err) require.Equal(t, apicommon.StateUnknown, keptnState) + require.True(t, workloadVersion.Status.DeploymentStartTime.IsZero()) +} + +func TestKeptnWorkloadVersionReconciler_reconcileDeployment_WorkloadDeploymentTimedOut(t *testing.T) { + + rep := int32(1) + replicaset := makeReplicaSet("myrep", "default", &rep, 0) + workloadVersion := makeWorkloadVersionWithRef(replicaset.ObjectMeta, "ReplicaSet") + + fakeClient := testcommon.NewTestClient(replicaset, workloadVersion) + + fakeRecorder := record.NewFakeRecorder(100) + + r := &KeptnWorkloadVersionReconciler{ + Client: fakeClient, + Config: config.Instance(), + EventSender: eventsender.NewK8sSender(fakeRecorder), + } + + r.Config.SetObservabilityTimeout(metav1.Duration{ + Duration: 5 * time.Second, + }) + + keptnState, err := r.reconcileDeployment(context.TODO(), workloadVersion) + require.Nil(t, err) + require.Equal(t, apicommon.StateProgressing, keptnState) + require.False(t, workloadVersion.Status.DeploymentStartTime.IsZero()) + + //revert the start time parameter backwards to check the timer + workloadVersion.Status.DeploymentStartTime = metav1.Time{ + Time: workloadVersion.Status.DeploymentStartTime.Add(-10 * time.Second), + } + + err = r.Client.Status().Update(context.TODO(), workloadVersion) + require.Nil(t, err) + + keptnState, err = r.reconcileDeployment(context.TODO(), workloadVersion) + require.Nil(t, err) + require.Equal(t, apicommon.StateFailed, keptnState) + require.False(t, workloadVersion.Status.DeploymentStartTime.IsZero()) + + event := <-fakeRecorder.Events + require.Equal(t, strings.Contains(event, workloadVersion.GetName()), true, "wrong workloadVersion") + require.Equal(t, strings.Contains(event, workloadVersion.GetNamespace()), true, "wrong namespace") + require.Equal(t, strings.Contains(event, "has reached timeout"), true, "wrong message") } func TestKeptnWorkloadVersionReconciler_reconcileDeployment_FailedStatefulSet(t *testing.T) { @@ -87,6 +133,7 @@ func TestKeptnWorkloadVersionReconciler_reconcileDeployment_FailedStatefulSet(t keptnState, err := r.reconcileDeployment(context.TODO(), workloadVersion) require.Nil(t, err) require.Equal(t, apicommon.StateProgressing, keptnState) + require.False(t, workloadVersion.Status.DeploymentStartTime.IsZero()) } func TestKeptnWorkloadVersionReconciler_reconcileDeployment_UnavailableStatefulSet(t *testing.T) { @@ -105,6 +152,7 @@ func TestKeptnWorkloadVersionReconciler_reconcileDeployment_UnavailableStatefulS keptnState, err := r.reconcileDeployment(context.TODO(), workloadVersion) require.NotNil(t, err) require.Equal(t, apicommon.StateUnknown, keptnState) + require.True(t, workloadVersion.Status.DeploymentStartTime.IsZero()) } func TestKeptnWorkloadVersionReconciler_reconcileDeployment_FailedDaemonSet(t *testing.T) { @@ -121,6 +169,7 @@ func TestKeptnWorkloadVersionReconciler_reconcileDeployment_FailedDaemonSet(t *t keptnState, err := r.reconcileDeployment(context.TODO(), workloadVersion) require.Nil(t, err) require.Equal(t, apicommon.StateProgressing, keptnState) + require.False(t, workloadVersion.Status.DeploymentStartTime.IsZero()) } func TestKeptnWorkloadVersionReconciler_reconcileDeployment_UnavailableDaemonSet(t *testing.T) { @@ -137,6 +186,7 @@ func TestKeptnWorkloadVersionReconciler_reconcileDeployment_UnavailableDaemonSet keptnState, err := r.reconcileDeployment(context.TODO(), workloadVersion) require.NotNil(t, err) require.Equal(t, apicommon.StateUnknown, keptnState) + require.True(t, workloadVersion.Status.DeploymentStartTime.IsZero()) } func TestKeptnWorkloadVersionReconciler_reconcileDeployment_ReadyReplicaSet(t *testing.T) { @@ -154,6 +204,7 @@ func TestKeptnWorkloadVersionReconciler_reconcileDeployment_ReadyReplicaSet(t *t keptnState, err := r.reconcileDeployment(context.TODO(), workloadVersion) require.Nil(t, err) require.Equal(t, apicommon.StateSucceeded, keptnState) + require.False(t, workloadVersion.Status.DeploymentStartTime.IsZero()) } func TestKeptnWorkloadVersionReconciler_reconcileDeployment_ReadyStatefulSet(t *testing.T) { @@ -171,6 +222,7 @@ func TestKeptnWorkloadVersionReconciler_reconcileDeployment_ReadyStatefulSet(t * keptnState, err := r.reconcileDeployment(context.TODO(), workloadVersion) require.Nil(t, err) require.Equal(t, apicommon.StateSucceeded, keptnState) + require.False(t, workloadVersion.Status.DeploymentStartTime.IsZero()) } func TestKeptnWorkloadVersionReconciler_reconcileDeployment_ReadyDaemonSet(t *testing.T) { @@ -187,6 +239,7 @@ func TestKeptnWorkloadVersionReconciler_reconcileDeployment_ReadyDaemonSet(t *te keptnState, err := r.reconcileDeployment(context.TODO(), workloadVersion) require.Nil(t, err) require.Equal(t, apicommon.StateSucceeded, keptnState) + require.False(t, workloadVersion.Status.DeploymentStartTime.IsZero()) } func TestKeptnWorkloadVersionReconciler_reconcileDeployment_UnsupportedReferenceKind(t *testing.T) { @@ -200,6 +253,7 @@ func TestKeptnWorkloadVersionReconciler_reconcileDeployment_UnsupportedReference keptnState, err := r.reconcileDeployment(context.TODO(), workloadVersion) require.ErrorIs(t, err, controllererrors.ErrUnsupportedWorkloadVersionResourceReference) require.Equal(t, apicommon.StateUnknown, keptnState) + require.True(t, workloadVersion.Status.DeploymentStartTime.IsZero()) } func makeReplicaSet(name string, namespace string, wanted *int32, available int32) *appsv1.ReplicaSet { diff --git a/lifecycle-operator/controllers/lifecycle/keptnworkloadversion/reconcile_deploymentstate.go b/lifecycle-operator/controllers/lifecycle/keptnworkloadversion/reconcile_deploymentstate.go index 20d132d379b..3067bdfab34 100644 --- a/lifecycle-operator/controllers/lifecycle/keptnworkloadversion/reconcile_deploymentstate.go +++ b/lifecycle-operator/controllers/lifecycle/keptnworkloadversion/reconcile_deploymentstate.go @@ -2,6 +2,7 @@ package keptnworkloadversion import ( "context" + "time" argov1alpha1 "github.com/argoproj/argo-rollouts/pkg/apis/rollouts/v1alpha1" klcv1beta1 "github.com/keptn/lifecycle-toolkit/lifecycle-operator/apis/lifecycle/v1beta1" @@ -15,6 +16,16 @@ func (r *KeptnWorkloadVersionReconciler) reconcileDeployment(ctx context.Context var isRunning bool var err error + if r.isDeploymentTimedOut(workloadVersion) { + workloadVersion.Status.DeploymentStatus = apicommon.StateFailed + err = r.Client.Status().Update(ctx, workloadVersion) + if err != nil { + return apicommon.StateUnknown, err + } + r.EventSender.Emit(apicommon.PhaseWorkloadDeployment, "Warning", workloadVersion, apicommon.PhaseStateFinished, "has reached timeout", workloadVersion.GetVersion()) + return workloadVersion.Status.DeploymentStatus, nil + } + switch workloadVersion.Spec.ResourceReference.Kind { case "ReplicaSet": isRunning, err = r.isReplicaSetRunning(ctx, workloadVersion.Spec.ResourceReference, workloadVersion.Namespace) @@ -29,10 +40,14 @@ func (r *KeptnWorkloadVersionReconciler) reconcileDeployment(ctx context.Context if err != nil { return apicommon.StateUnknown, err } + + if !workloadVersion.IsDeploymentStartTimeSet() { + workloadVersion.SetDeploymentStartTime() + workloadVersion.Status.DeploymentStatus = apicommon.StateProgressing + } + if isRunning { workloadVersion.Status.DeploymentStatus = apicommon.StateSucceeded - } else { - workloadVersion.Status.DeploymentStatus = apicommon.StateProgressing } err = r.Client.Status().Update(ctx, workloadVersion) @@ -42,6 +57,16 @@ func (r *KeptnWorkloadVersionReconciler) reconcileDeployment(ctx context.Context return workloadVersion.Status.DeploymentStatus, nil } +func (r *KeptnWorkloadVersionReconciler) isDeploymentTimedOut(workloadVersion *klcv1beta1.KeptnWorkloadVersion) bool { + if !workloadVersion.IsDeploymentStartTimeSet() { + return false + } + + deploymentDeadline := workloadVersion.Status.DeploymentStartTime.Add(r.Config.GetObservabilityTimeout().Duration) + currentTime := time.Now().UTC() + return currentTime.After(deploymentDeadline) +} + func (r *KeptnWorkloadVersionReconciler) isReplicaSetRunning(ctx context.Context, resource klcv1beta1.ResourceReference, namespace string) (bool, error) { rep := appsv1.ReplicaSet{} err := r.Client.Get(ctx, types.NamespacedName{Name: resource.Name, Namespace: namespace}, &rep) diff --git a/test/chainsaw/non-blocking-deployment/verify-keptnconfig.sh b/test/chainsaw/common/verify-keptnconfig.sh similarity index 100% rename from test/chainsaw/non-blocking-deployment/verify-keptnconfig.sh rename to test/chainsaw/common/verify-keptnconfig.sh diff --git a/test/chainsaw/non-blocking-deployment/chainsaw-test.yaml b/test/chainsaw/non-blocking-deployment/chainsaw-test.yaml index 0131b6de825..748173a8d01 100755 --- a/test/chainsaw/non-blocking-deployment/chainsaw-test.yaml +++ b/test/chainsaw/non-blocking-deployment/chainsaw-test.yaml @@ -16,7 +16,7 @@ spec: - name: step-01 try: - script: - content: ./verify-keptnconfig.sh + content: ./../common/verify-keptnconfig.sh - sleep: duration: 30s - name: step-02 @@ -32,7 +32,7 @@ spec: - name: step-04 try: - script: - content: ./verify-keptnconfig.sh + content: ./../common/verify-keptnconfig.sh - sleep: duration: 30s - name: step-05 diff --git a/test/chainsaw/timeout-failure-deployment/00-assert.yaml b/test/chainsaw/timeout-failure-deployment/00-assert.yaml new file mode 100644 index 00000000000..71fb343e2d3 --- /dev/null +++ b/test/chainsaw/timeout-failure-deployment/00-assert.yaml @@ -0,0 +1,50 @@ +apiVersion: lifecycle.keptn.sh/v1beta1 +kind: KeptnAppVersion +metadata: + name: podtato-head-0.1.0-6b86b273 +spec: + appName: podtato-head + revision: 1 + version: 0.1.0 + workloads: + - name: podtato-head-entry + version: 0.1.0 +status: + currentPhase: AppDeploy + postDeploymentEvaluationStatus: Deprecated + postDeploymentStatus: Deprecated + preDeploymentEvaluationStatus: Succeeded + preDeploymentStatus: Succeeded + promotionStatus: Deprecated + status: Failed + workloadOverallStatus: Failed +--- +apiVersion: lifecycle.keptn.sh/v1beta1 +kind: KeptnWorkloadVersion +metadata: + generation: 1 + name: podtato-head-podtato-head-entry-0.1.0 +spec: + app: podtato-head + version: 0.1.0 + workloadName: podtato-head-podtato-head-entry +status: + currentPhase: WorkloadDeploy + deploymentStatus: Failed + postDeploymentEvaluationStatus: Deprecated + postDeploymentStatus: Deprecated + preDeploymentEvaluationStatus: Succeeded + preDeploymentStatus: Succeeded + status: Failed +--- +apiVersion: v1 +kind: Pod +metadata: + annotations: + keptn.sh/app: podtato-head + keptn.sh/version: 0.1.0 + keptn.sh/workload: podtato-head-entry + labels: + component: podtato-head-entry +status: + phase: Pending diff --git a/test/chainsaw/timeout-failure-deployment/00-install.yaml b/test/chainsaw/timeout-failure-deployment/00-install.yaml new file mode 100644 index 00000000000..87b99616d6a --- /dev/null +++ b/test/chainsaw/timeout-failure-deployment/00-install.yaml @@ -0,0 +1,28 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: podtato-head-entry + labels: + app: podtato-head +spec: + selector: + matchLabels: + component: podtato-head-entry + template: + metadata: + labels: + component: podtato-head-entry + annotations: + keptn.sh/app: podtato-head + keptn.sh/workload: podtato-head-entry + keptn.sh/version: 0.1.0 + spec: + containers: + - name: server + image: ghcr.io/podtato-head/entry:non-existing + imagePullPolicy: Always + ports: + - containerPort: 9000 + env: + - name: PODTATO_PORT + value: "9000" diff --git a/test/chainsaw/timeout-failure-deployment/chainsaw-test.yaml b/test/chainsaw/timeout-failure-deployment/chainsaw-test.yaml new file mode 100755 index 00000000000..e28aaf85dd4 --- /dev/null +++ b/test/chainsaw/timeout-failure-deployment/chainsaw-test.yaml @@ -0,0 +1,31 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/kyverno/chainsaw/main/.schemas/json/test-chainsaw-v1alpha1.json +apiVersion: chainsaw.kyverno.io/v1alpha1 +kind: Test +metadata: + name: timeout-failure-deployment +spec: + namespaceTemplate: + metadata: + annotations: + keptn.sh/lifecycle-toolkit: enabled + steps: + - name: step-00 + try: + - apply: + file: keptnconfig.yaml + - name: step-01 + try: + - script: + content: ./../common/verify-keptnconfig.sh + - sleep: + duration: 30s + - name: step-02 + try: + - apply: + file: 00-install.yaml + - assert: + file: 00-assert.yaml + - name: step-03 + try: + - apply: + file: keptnconfig-default.yaml diff --git a/test/chainsaw/timeout-failure-deployment/keptnconfig-default.yaml b/test/chainsaw/timeout-failure-deployment/keptnconfig-default.yaml new file mode 100644 index 00000000000..b815e77ae5b --- /dev/null +++ b/test/chainsaw/timeout-failure-deployment/keptnconfig-default.yaml @@ -0,0 +1,6 @@ +apiVersion: options.keptn.sh/v1alpha1 +kind: KeptnConfig +metadata: + name: non-blocking-config +spec: + observabilityTimeout: 5m diff --git a/test/chainsaw/timeout-failure-deployment/keptnconfig.yaml b/test/chainsaw/timeout-failure-deployment/keptnconfig.yaml new file mode 100644 index 00000000000..686f188ab60 --- /dev/null +++ b/test/chainsaw/timeout-failure-deployment/keptnconfig.yaml @@ -0,0 +1,6 @@ +apiVersion: options.keptn.sh/v1alpha1 +kind: KeptnConfig +metadata: + name: non-blocking-config +spec: + observabilityTimeout: 10s