From 145d6bb7c8275f83957f8fd6629c2fab5f653675 Mon Sep 17 00:00:00 2001 From: Giovanni Liva Date: Mon, 24 Oct 2022 15:01:46 +0200 Subject: [PATCH] Add WLI tracing Signed-off-by: Giovanni Liva --- .../v1alpha1/keptnworkloadinstance_types.go | 5 +- .../keptnworkloadinstance/controller.go | 81 ++++++++++++++++--- 2 files changed, 72 insertions(+), 14 deletions(-) diff --git a/operator/api/v1alpha1/keptnworkloadinstance_types.go b/operator/api/v1alpha1/keptnworkloadinstance_types.go index d619aea7319..c0b33acafea 100644 --- a/operator/api/v1alpha1/keptnworkloadinstance_types.go +++ b/operator/api/v1alpha1/keptnworkloadinstance_types.go @@ -30,8 +30,9 @@ import ( // KeptnWorkloadInstanceSpec defines the desired state of KeptnWorkloadInstance type KeptnWorkloadInstanceSpec struct { KeptnWorkloadSpec `json:",inline"` - WorkloadName string `json:"workloadName"` - PreviousVersion string `json:"previousVersion,omitempty"` + WorkloadName string `json:"workloadName"` + PreviousVersion string `json:"previousVersion,omitempty"` + TraceId map[string]string `json:"traceId,omitempty"` } // KeptnWorkloadInstanceStatus defines the observed state of KeptnWorkloadInstance diff --git a/operator/controllers/keptnworkloadinstance/controller.go b/operator/controllers/keptnworkloadinstance/controller.go index 813032550ec..4453614a8ce 100644 --- a/operator/controllers/keptnworkloadinstance/controller.go +++ b/operator/controllers/keptnworkloadinstance/controller.go @@ -47,11 +47,12 @@ import ( // KeptnWorkloadInstanceReconciler reconciles a KeptnWorkloadInstance object type KeptnWorkloadInstanceReconciler struct { client.Client - Scheme *runtime.Scheme - Recorder record.EventRecorder - Log logr.Logger - Meters common.KeptnMeters - Tracer trace.Tracer + Scheme *runtime.Scheme + Recorder record.EventRecorder + Log logr.Logger + Meters common.KeptnMeters + Tracer trace.Tracer + bindCRDSpan map[string]trace.Span } //+kubebuilder:rbac:groups=lifecycle.keptn.sh,resources=keptnworkloadinstances,verbs=get;list;watch;create;update;patch;delete @@ -113,6 +114,9 @@ func (r *KeptnWorkloadInstanceReconciler) Reconcile(ctx context.Context, req ctr return reconcile.Result{Requeue: true, RequeueAfter: 10 * time.Second}, fmt.Errorf("could not find AppVersion for KeptnWorkloadInstance") } + appTraceContextCarrier := propagation.MapCarrier(appVersion.Spec.TraceId) + ctxAppTrace := otel.GetTextMapPropagator().Extract(context.TODO(), appTraceContextCarrier) + appPreEvalStatus := appVersion.Status.PreDeploymentEvaluationStatus if !appPreEvalStatus.IsSucceeded() { if appPreEvalStatus.IsFailed() { @@ -125,15 +129,28 @@ func (r *KeptnWorkloadInstanceReconciler) Reconcile(ctx context.Context, req ctr //Wait for pre-deployment checks of Workload phase = common.PhaseWorkloadPreDeployment - + saveState := false //Set state to progressing if not already set if workloadInstance.Status.PreDeploymentStatus == common.StatePending { workloadInstance.Status.PreDeploymentStatus = common.StateProgressing + saveState = true + } + // set the App trace id if not already set + if len(workloadInstance.Spec.TraceId) < 1 { + workloadInstance.Spec.TraceId = appVersion.Spec.TraceId + saveState = true + } + if saveState { if err := r.Status().Update(ctx, workloadInstance); err != nil { return ctrl.Result{}, err } } if appVersion.Status.CurrentPhase == "" { + r.unbindSpan(workloadInstance, phase.ShortName) + var spanAppTrace trace.Span + ctxAppTrace, spanAppTrace = r.getSpan(ctxAppTrace, workloadInstance, phase.ShortName) + semconv.AddAttributeFromAppVersion(spanAppTrace, appVersion) + spanAppTrace.AddEvent("WorkloadInstance Pre-Deployment Tasks started", trace.WithTimestamp(time.Now())) r.recordEvent(phase, "Normal", workloadInstance, "Started", "have started") } @@ -141,7 +158,7 @@ func (r *KeptnWorkloadInstanceReconciler) Reconcile(ctx context.Context, req ctr reconcilePre := func() (common.KeptnState, error) { return r.reconcilePrePostDeployment(ctx, workloadInstance, common.PreDeploymentCheckType) } - return r.handlePhase(ctx, workloadInstance, phase, span, workloadInstance.IsPreDeploymentFailed, reconcilePre) + return r.handlePhase(ctx, ctxAppTrace, workloadInstance, phase, span, workloadInstance.IsPreDeploymentFailed, reconcilePre) } //Wait for pre-evaluation checks of Workload @@ -158,7 +175,7 @@ func (r *KeptnWorkloadInstanceReconciler) Reconcile(ctx context.Context, req ctr reconcilePreEval := func() (common.KeptnState, error) { return r.reconcilePrePostEvaluation(ctx, workloadInstance, common.PreDeploymentEvaluationCheckType) } - return r.handlePhase(ctx, workloadInstance, phase, span, workloadInstance.IsPreDeploymentEvaluationFailed, reconcilePreEval) + return r.handlePhase(ctx, ctxAppTrace, workloadInstance, phase, span, workloadInstance.IsPreDeploymentEvaluationFailed, reconcilePreEval) } //Wait for deployment of Workload @@ -174,7 +191,7 @@ func (r *KeptnWorkloadInstanceReconciler) Reconcile(ctx context.Context, req ctr reconcileWorkloadInstance := func() (common.KeptnState, error) { return r.reconcileDeployment(ctx, workloadInstance) } - return r.handlePhase(ctx, workloadInstance, phase, span, workloadInstance.IsDeploymentFailed, reconcileWorkloadInstance) + return r.handlePhase(ctx, ctxAppTrace, workloadInstance, phase, span, workloadInstance.IsDeploymentFailed, reconcileWorkloadInstance) } //Wait for post-deployment checks of Workload @@ -190,7 +207,7 @@ func (r *KeptnWorkloadInstanceReconciler) Reconcile(ctx context.Context, req ctr reconcilePostDeployment := func() (common.KeptnState, error) { return r.reconcilePrePostDeployment(ctx, workloadInstance, common.PostDeploymentCheckType) } - return r.handlePhase(ctx, workloadInstance, phase, span, workloadInstance.IsPostDeploymentFailed, reconcilePostDeployment) + return r.handlePhase(ctx, ctxAppTrace, workloadInstance, phase, span, workloadInstance.IsPostDeploymentFailed, reconcilePostDeployment) } //Wait for post-evaluation checks of Workload @@ -206,7 +223,7 @@ func (r *KeptnWorkloadInstanceReconciler) Reconcile(ctx context.Context, req ctr reconcilePostEval := func() (common.KeptnState, error) { return r.reconcilePrePostEvaluation(ctx, workloadInstance, common.PostDeploymentEvaluationCheckType) } - return r.handlePhase(ctx, workloadInstance, phase, span, workloadInstance.IsPostDeploymentEvaluationFailed, reconcilePostEval) + return r.handlePhase(ctx, ctxAppTrace, workloadInstance, phase, span, workloadInstance.IsPostDeploymentEvaluationFailed, reconcilePostEval) } // WorkloadInstance is completed at this place @@ -260,23 +277,31 @@ func (r *KeptnWorkloadInstanceReconciler) GetActiveDeployments(ctx context.Conte return res, nil } -func (r *KeptnWorkloadInstanceReconciler) handlePhase(ctx context.Context, workloadInstance *klcv1alpha1.KeptnWorkloadInstance, phase common.KeptnPhaseType, span trace.Span, phaseFailed func() bool, reconcilePhase func() (common.KeptnState, error)) (ctrl.Result, error) { +func (r *KeptnWorkloadInstanceReconciler) handlePhase(ctx context.Context, ctxAppTrace context.Context, workloadInstance *klcv1alpha1.KeptnWorkloadInstance, phase common.KeptnPhaseType, span trace.Span, phaseFailed func() bool, reconcilePhase func() (common.KeptnState, error)) (ctrl.Result, error) { r.Log.Info(phase.LongName + " not finished") overallStateUpdated := false oldstate := workloadInstance.Status.Status oldPhase := workloadInstance.Status.CurrentPhase workloadInstance.Status.CurrentPhase = phase.ShortName + + ctxAppTrace, spanAppTrace := r.getSpan(ctxAppTrace, workloadInstance, phase.ShortName) + if phaseFailed() { //TODO eventually we should decide whether a task returns FAILED, currently we never have this status set r.recordEvent(phase, "Warning", workloadInstance, "Failed", "has failed") return ctrl.Result{Requeue: true, RequeueAfter: 60 * time.Second}, nil } state, err := reconcilePhase() if err != nil { + spanAppTrace.AddEvent(phase.LongName + " could not get reconciled") r.recordEvent(phase, "Warning", workloadInstance, "ReconcileErrored", "could not get reconciled") span.SetStatus(codes.Error, err.Error()) return ctrl.Result{Requeue: true}, err } if state.IsSucceeded() { + spanAppTrace.AddEvent(phase.LongName + " has succeeded") + spanAppTrace.SetStatus(codes.Ok, "Succeeded") + spanAppTrace.End() + r.unbindSpan(workloadInstance, phase.ShortName) r.recordEvent(phase, "Normal", workloadInstance, "Succeeded", "has succeeded") } else if state.IsFailed() { r.recordEvent(phase, "Warning", workloadInstance, "Failed", "has failed") @@ -286,15 +311,23 @@ func (r *KeptnWorkloadInstanceReconciler) handlePhase(ctx context.Context, workl attrs := workloadInstance.GetMetricsAttributes() r.Meters.DeploymentCount.Add(ctx, 1, attrs...) + spanAppTrace.AddEvent(phase.LongName + " has failed") + spanAppTrace.SetStatus(codes.Error, "Failed") + spanAppTrace.End() + r.unbindSpan(workloadInstance, phase.ShortName) + overallStateUpdated = true } else { if oldstate != common.StateProgressing { workloadInstance.Status.Status = common.StateProgressing overallStateUpdated = true } + spanAppTrace.AddEvent(phase.LongName + " not finished") r.recordEvent(phase, "Warning", workloadInstance, "NotFinished", "has not finished") } if oldPhase != workloadInstance.Status.CurrentPhase { + ctxAppTrace, spanAppTrace = r.getSpan(ctxAppTrace, workloadInstance, workloadInstance.Status.CurrentPhase) + semconv.AddAttributeFromWorkloadInstance(spanAppTrace, *workloadInstance) overallStateUpdated = true } @@ -368,3 +401,27 @@ func (r *KeptnWorkloadInstanceReconciler) getAppVersionForWorkloadInstance(ctx c } return true, latestVersion, nil } + +func (r *KeptnWorkloadInstanceReconciler) getSpan(ctx context.Context, wli *klcv1alpha1.KeptnWorkloadInstance, phase string) (context.Context, trace.Span) { + wliName := r.getSpanName(wli, phase) + spanName := fmt.Sprintf("%s/%s", wli.Spec.WorkloadName, phase) + + if r.bindCRDSpan == nil { + r.bindCRDSpan = make(map[string]trace.Span) + } + if span, ok := r.bindCRDSpan[wliName]; ok { + return ctx, span + } + r.Log.Info("DEBUG: Start Span: " + wliName) + ctx, span := r.Tracer.Start(ctx, spanName, trace.WithSpanKind(trace.SpanKindConsumer)) + r.bindCRDSpan[wliName] = span + return ctx, span +} + +func (r *KeptnWorkloadInstanceReconciler) unbindSpan(wli *klcv1alpha1.KeptnWorkloadInstance, phase string) { + delete(r.bindCRDSpan, r.getSpanName(wli, phase)) +} + +func (r *KeptnWorkloadInstanceReconciler) getSpanName(wli *klcv1alpha1.KeptnWorkloadInstance, phase string) string { + return fmt.Sprintf("%s.%s.%s.%s.%s", wli.Spec.TraceId, wli.Spec.AppName, wli.Spec.WorkloadName, wli.Spec.Version, phase) +}