cadence-workflow · taylanisikdemir · Oct 19, 2023 · Oct 25, 2023 · Oct 25, 2023 · Oct 25, 2023
@@ -508,6 +508,7 @@ func (e *NonDeterministicError) Error() string {
 	case "mismatch":
 		// historical text
 		return "nondeterministic workflow: " +
+			"mismatching history event and replay decision found. " +
 			"history event is " + e.HistoryEventText + ", " +
 			"replay decision is " + e.DecisionText
 	default:

@@ -432,6 +432,10 @@ func deSerializeFunctionResult(f interface{}, result []byte, to interface{}, dat
 	}
 
 	// For everything we return result.
+	// Code reaches here for 2 cases:
+	// 	1. activity is executed by name (not the func pointer) and it wasn't registered
+	// 	2. activity is executed by func pointer and the signature indicates it doesn't/can't return data.
+	//		for example it only has one return parameter (which can only be an error).
 	return decodeArg(dataConverter, result, to)
 }
 

@@ -262,6 +262,28 @@ func isDecisionEvent(eventType s.EventType) bool {
 	}
 }
 
+// isDecisionEventForReplay differs from isDecisionEvent: during replays we intentionally
+// ignore workflow complete/fail/cancel/continueasnew events so that the decisions
+// produced by the replay match the respond events collected while processing the workflow task.
+func isDecisionEventForReplay(eventType s.EventType) bool {
+	switch eventType {
+	case
+		s.EventTypeActivityTaskScheduled,
+		s.EventTypeActivityTaskCancelRequested,
+		s.EventTypeTimerStarted,
+		s.EventTypeTimerCanceled,
+		s.EventTypeCancelTimerFailed,
+		s.EventTypeMarkerRecorded,
+		s.EventTypeStartChildWorkflowExecutionInitiated,
+		s.EventTypeRequestCancelExternalWorkflowExecutionInitiated,
+		s.EventTypeSignalExternalWorkflowExecutionInitiated,
+		s.EventTypeUpsertWorkflowSearchAttributes:
+		return true
+	default:
+		return false
+	}
+}
+
 // NextDecisionEvents returns events that are processed as new by the next decision.
 // TODO(maxim): Refactor to return a struct instead of multiple parameters
 func (eh *history) NextDecisionEvents() (result []*s.HistoryEvent, markers []*s.HistoryEvent, binaryChecksum *string, err error) {
@@ -840,6 +862,19 @@ process_Workflow_Loop:
 	return response, err
 }
 
+// ProcessWorkflowTask processes the given workflow which includes
+// - fetching, reordering and replaying historical decision events. (Decision events in this context is an umbrella term for workflow relevant events)
+// - state machine is incrementally built with every decision.
+// - state machine makes sure that when a workflow restarts for some reason same activities (or timers etc.) are not called again and previous result state is loaded into memory
+//
+// Note about Replay tests mode:
+//
+//	This mode works by collecting the historical decision respond events (as defined in isDecisionEventForReplay())
+//	and comparing them with the decisions produced by the state machine during replay.
+//
+//	Compared to isDecisionEvent(), isDecisionEventForReplay() omits the following events even though they are workflow-relevant respond events:
+//		complete/failed/cancel/continueasnew
+//	The reason is that the state machine doesn't have a corresponding decision for these, so they cause false-positive non-determinism errors in Replay tests.
 func (w *workflowExecutionContextImpl) ProcessWorkflowTask(workflowTask *workflowTask) (interface{}, error) {
 	task := workflowTask.task
 	historyIterator := workflowTask.historyIterator
@@ -899,8 +934,16 @@ ProcessEvents:
 		for i, event := range reorderedEvents {
 			isInReplay := reorderedHistory.IsReplayEvent(event)
 			isLast := !isInReplay && i == len(reorderedEvents)-1
-			if !skipReplayCheck && isDecisionEvent(event.GetEventType()) {
-				respondEvents = append(respondEvents, event)
+			if !skipReplayCheck {
+				isDecisionEventFn := isDecisionEvent
+				// when strict nondeterminism is enabled we use a different function to check for decision events during replay
+				if !w.wth.disableStrictNonDeterminism && isInReplay {
+					isDecisionEventFn = isDecisionEventForReplay
+				}
+
+				if isDecisionEventFn(event.GetEventType()) {
+					respondEvents = append(respondEvents, event)
+				}
 			}
 
 			if isPreloadMarkerEvent(event) {
@@ -918,7 +961,16 @@ ProcessEvents:
 			if err != nil {
 				return nil, err
 			}
-			if w.isWorkflowCompleted {
+
+			// Break the event processing loop if either
+			//  - Workflow is completed AND strict nondeterminism checks disabled.
+			//  - Workflow is completed AND strict nondeterminism checks enabled AND NOT in replay mode.
+			// 		With strict nondeterminism checks enabled, breaking the loop early causes missing events
+			// 		in respondEvents which then causes false positives or false negatives.
+			stopProcessing := (w.isWorkflowCompleted && w.wth.disableStrictNonDeterminism) ||
+				(w.isWorkflowCompleted && !w.wth.disableStrictNonDeterminism && !isInReplay)
+
+			if stopProcessing {
 				break ProcessEvents
 			}
 		}
@@ -936,6 +988,9 @@ ProcessEvents:
 			}
 		}
 		isReplay := len(reorderedEvents) > 0 && reorderedHistory.IsReplayEvent(reorderedEvents[len(reorderedEvents)-1])
+		// incomplete decisions (e.g. start without a complete) at the end of history will still have decisions in decisionsHelper
+		// but there won't be corresponding respond events. This breaks the non-determinism check therefore we ignore such final partial decisions.
+		// Example scenario is covered by TestReplayWorkflowHistory_Partial_NoDecisionEvents
 		lastDecisionEventsForReplayTest := isReplayTest && !reorderedHistory.HasNextDecisionEvents()
 		if isReplay && !lastDecisionEventsForReplayTest {
 			eventDecisions := eventHandler.decisionsHelper.getDecisions(true)

@@ -880,9 +880,9 @@ func (t *TaskHandlersTestSuite) TestWorkflowTask_NondeterministicLogNonexistingI
 	require.NotNil(t.T(), replayErrorField)
 	require.Equal(t.T(), zapcore.ErrorType, replayErrorField.Type)
 	require.ErrorContains(t.T(), replayErrorField.Interface.(error),
-		"nondeterministic workflow: "+
+		"nondeterministic workflow: mismatching history event and replay decision found. "+
 			"history event is ActivityTaskScheduled: (ActivityId:NotAnActivityID, ActivityType:(Name:pkg.Greeter_Activity), TaskList:(Name:taskList), Input:[]), "+
-			"replay decision is ScheduleActivityTask: (ActivityId:0, ActivityType:(Name:Greeter_Activity), TaskList:(Name:taskList)")
+			"replay decision is ScheduleActivityTask: (ActivityId:0, ActivityType:(Name:Greeter_Activity), TaskList:(Name:taskList), Input:[], ScheduleToCloseTimeoutSeconds:120, ScheduleToStartTimeoutSeconds:60, StartToCloseTimeoutSeconds:60, HeartbeatTimeoutSeconds:20, Header:(Fields:map[]))")
 }
 
 func (t *TaskHandlersTestSuite) TestWorkflowTask_WorkflowReturnsPanicError() {

@@ -99,6 +99,23 @@ func (s *workflowReplayerSuite) TestReplayWorkflowHistory_Partial_WithDecisionEv
 	s.NoError(err)
 }
 
+// This test case covers partial decision scenario where a decision is started but not closed
+// History:
+//
+//	1: WorkflowExecutionStarted
+//	2: DecisionTaskScheduled
+//	3: DecisionTaskStarted
+//	4: DecisionTaskFailed
+//	5: DecisionTaskScheduled
+//	6: DecisionTaskStarted
+//
+// Notes on task handling logic during replay:
+//
+//	reorderedHistory.NextDecisionEvents() ignores events 2, 3, 4 because that decision task failed.
+//	It only returns 1 and 6 to be replayed.
+//	6 changes the state in decisionsHelper (generates a decision) however there's no corresponding
+//	respond due to the missing close event (failed/complete etc.)
+//	Such partial decisions at the end of the history are ignored during replay tests to avoid non-determinism errors.
 func (s *workflowReplayerSuite) TestReplayWorkflowHistory_Partial_NoDecisionEvents() {
 	err := s.replayer.ReplayWorkflowHistory(s.logger, getTestReplayWorkflowPartialHistoryNoDecisionEvents(s.T()))
 	s.NoError(err)

@@ -72,7 +72,7 @@ func sampleBranchWorkflow2(ctx workflow.Context) error {
 	}
 	ctx = workflow.WithActivityOptions(ctx, ao)
 
-	for i := 1; i <= 4; i++ {
+	for i := 1; i <= 2; i++ {
 		activityInput := fmt.Sprintf("branch %d of 4", i)
 		future := workflow.ExecuteActivity(ctx, sampleActivity, activityInput)
 		futures = append(futures, future)

@@ -176,7 +176,34 @@
   },
   {
     "eventId": 12,
-    "timestamp": 1679427717321911295,
+    "timestamp": 1679427717321780254,
+    "eventType": "ActivityTaskStarted",
+    "version": 0,
+    "taskId": 5243011,
+    "activityTaskStartedEventAttributes": {
+      "scheduledEventId": 11,
+      "identity": "82203@agautam-NV709R969P@choiceGroup@41d230ae-253a-4d01-9079-322ef05c09fb",
+      "requestId": "ae2aad96-6588-4359-807b-a39a16f0896a",
+      "attempt": 0,
+      "lastFailureReason": ""
+    }
+  },
+  {
+    "eventId": 13,
+    "timestamp": 1679427717321780255,
+    "eventType": "ActivityTaskCompleted",
+    "version": 0,
+    "taskId": 5243000,
+    "activityTaskCompletedEventAttributes": {
+      "result": "ImJhbmFuYSIK",
+      "scheduledEventId": 11,
+      "startedEventId": 12,
+      "identity": "82203@agautam-NV709R969P@choiceGroup@41d230ae-253a-4d01-9079-322ef05c09fb"
+    }
+  },
+  {
+    "eventId": 14,
+    "timestamp": 1679427717321780256,
     "eventType": "WorkflowExecutionCompleted",
     "version": 0,
     "taskId": 5243011,

@@ -0,0 +1,90 @@
+[
+    {
+        "eventId": 1,
+        "timestamp": 1699856700704442400,
+        "eventType": "WorkflowExecutionStarted",
+        "version": 4,
+        "taskId": 882931375,
+        "workflowExecutionStartedEventAttributes": {
+            "workflowType": {
+                "name": "fx.SimpleSignalWorkflow"
+            },
+            "taskList": {
+                "name": "fx-worker"
+            },
+            "executionStartToCloseTimeoutSeconds": 600,
+            "taskStartToCloseTimeoutSeconds": 10,
+            "continuedExecutionRunId": "a664f402-bfe9-4739-945c-9cbc637548f1",
+            "initiator": "CronSchedule",
+            "continuedFailureReason": "cadenceInternal:Timeout START_TO_CLOSE",
+            "originalExecutionRunId": "d0baf930-6a83-4740-b773-71aaa696eed1",
+            "firstExecutionRunId": "e85fa1b9-8899-40ce-8af9-7e0f93ed7ae5",
+            "firstScheduleTimeNano": "2023-05-22T15:45:26.535595761-07:00",
+            "cronSchedule": "* * * * *",
+            "firstDecisionTaskBackoffSeconds": 60,
+            "PartitionConfig": {
+                "isolation-group": "dca11"
+            }
+        }
+    },
+    {
+        "eventId": 2,
+        "timestamp": 1699856760713586608,
+        "eventType": "DecisionTaskScheduled",
+        "version": 4,
+        "taskId": 882931383,
+        "decisionTaskScheduledEventAttributes": {
+            "taskList": {
+                "name": "fx-worker"
+            },
+            "startToCloseTimeoutSeconds": 10
+        }
+    },
+    {
+        "eventId": 3,
+        "timestamp": 1699856760741837021,
+        "eventType": "DecisionTaskStarted",
+        "version": 4,
+        "taskId": 882931387,
+        "decisionTaskStartedEventAttributes": {
+            "scheduledEventId": 2,
+            "identity": "202@dca50-7q@fx-worker@db443597-5124-483a-b1a5-4b1ff35a0ed4",
+            "requestId": "bb0ee926-13d1-4af4-9f9c-51433333ad04"
+        }
+    },
+    {
+        "eventId": 4,
+        "timestamp": 1699856760773459755,
+        "eventType": "DecisionTaskCompleted",
+        "version": 4,
+        "taskId": 882931391,
+        "decisionTaskCompletedEventAttributes": {
+            "scheduledEventId": 2,
+            "startedEventId": 3,
+            "identity": "202@dca50-7q@fx-worker@db443597-5124-483a-b1a5-4b1ff35a0ed4",
+            "binaryChecksum": "uDeploy:dc3e318b30a49e8bb88f462a50fe3a01dd210a3a"
+        }
+    },
+    {
+        "eventId": 5,
+        "timestamp": 1699857360713649962,
+        "eventType": "WorkflowExecutionContinuedAsNew",
+        "version": 4,
+        "taskId": 882931394,
+        "workflowExecutionContinuedAsNewEventAttributes": {
+            "newExecutionRunId": "06c2468c-2d2d-44f7-ac7a-ff3c383f6e90",
+            "workflowType": {
+                "name": "fx.SimpleSignalWorkflow"
+            },
+            "taskList": {
+                "name": "fx-worker"
+            },
+            "executionStartToCloseTimeoutSeconds": 600,
+            "taskStartToCloseTimeoutSeconds": 10,
+            "decisionTaskCompletedEventId": -23,
+            "backoffStartIntervalInSeconds": 60,
+            "initiator": "CronSchedule",
+            "failureReason": "cadenceInternal:Timeout START_TO_CLOSE"
+        }
+    }
+]
@@ -0,0 +1,26 @@
+package replaytests
+
+import (
+	"go.uber.org/cadence/workflow"
+	"go.uber.org/zap"
+)
+
+// ContinueAsNewWorkflow is a sample Cadence workflow that loops receiving "helloWorldSignal" signals and returns nil once a signal carries the value "kill".
+func ContinueAsNewWorkflow(ctx workflow.Context) error {
+	selector := workflow.NewSelector(ctx)
+	var signalResult string
+	signalName := "helloWorldSignal"
+	for { // NOTE(review): AddReceive below runs on every iteration, registering another handler on the same selector each loop — confirm this is intended
+		signalChan := workflow.GetSignalChannel(ctx, signalName)
+		selector.AddReceive(signalChan, func(c workflow.Channel, more bool) {
+			c.Receive(ctx, &signalResult)
+			workflow.GetLogger(ctx).Info("Received age signalResult from signal!", zap.String("signal", signalName), zap.String("value", signalResult))
+		})
+		workflow.GetLogger(ctx).Info("Waiting for signal on channel.. " + signalName)
+		// Block until a signal arrives; the receive handler stores its payload in signalResult.
+		selector.Select(ctx)
+		if signalResult == "kill" {
+			return nil
+		}
+	}
+}
@@ -39,7 +39,7 @@ const (
 	orderChoiceCherry = "cherry"
 )
 
-// exclusiveChoiceWorkflow Workflow Decider. This workflow executes Cherry order.
+// exclusiveChoiceWorkflow executes main.getOrderActivity and then executes either the cherry or the banana activity, depending on what main.getOrderActivity returns.
 func exclusiveChoiceWorkflow(ctx workflow.Context) error {
 	// Get order.
 	ao := workflow.ActivityOptions{
@@ -50,7 +50,7 @@ func exclusiveChoiceWorkflow(ctx workflow.Context) error {
 	ctx = workflow.WithActivityOptions(ctx, ao)
 
 	var orderChoice string
-	err := workflow.ExecuteActivity(ctx, getOrderActivity).Get(ctx, &orderChoice)
+	err := workflow.ExecuteActivity(ctx, "main.getOrderActivity").Get(ctx, &orderChoice)
 	if err != nil {
 		return err
 	}
@@ -60,9 +60,9 @@ func exclusiveChoiceWorkflow(ctx workflow.Context) error {
 	// choose next activity based on order result
 	switch orderChoice {
 	case orderChoiceBanana:
-		workflow.ExecuteActivity(ctx, orderBananaActivity, orderChoice)
+		workflow.ExecuteActivity(ctx, "main.orderBananaActivity", orderChoice)
 	case orderChoiceCherry:
-		workflow.ExecuteActivity(ctx, orderCherryActivity, orderChoice)
+		workflow.ExecuteActivity(ctx, "main.orderCherryActivity", orderChoice)
 	default:
 		logger.Error("Unexpected order", zap.String("Choice", orderChoice))
 	}
@@ -71,8 +71,8 @@ func exclusiveChoiceWorkflow(ctx workflow.Context) error {
 	return nil
 }
 
-// This workflow explicitly executes Apple Activity received from the getorderActivity.
-func exclusiveChoiceWorkflow2(ctx workflow.Context) error {
+// exclusiveChoiceWorkflowAlwaysCherry executes main.getOrderActivity, ignores the returned order and always executes main.orderCherryActivity.
+func exclusiveChoiceWorkflowAlwaysCherry(ctx workflow.Context) error {
 	// Get order.
 	ao := workflow.ActivityOptions{
 		ScheduleToStartTimeout: time.Minute,
@@ -82,40 +82,25 @@ func exclusiveChoiceWorkflow2(ctx workflow.Context) error {
 	ctx = workflow.WithActivityOptions(ctx, ao)
 
 	var orderChoice string
-	err := workflow.ExecuteActivity(ctx, getAppleOrderActivity).Get(ctx, &orderChoice)
+	err := workflow.ExecuteActivity(ctx, "main.getOrderActivity").Get(ctx, &orderChoice)
 	if err != nil {
 		return err
 	}
 
 	logger := workflow.GetLogger(ctx)
+	logger.Sugar().Infof("Got order for %s but will ignore and order cherry!!", orderChoice)
 
-	// choose next activity based on order result. It's apple in this case.
-	switch orderChoice {
-	case orderChoiceApple:
-		workflow.ExecuteActivity(ctx, orderAppleActivity, orderChoice)
-	default:
-		logger.Error("Unexpected order", zap.String("Choice", orderChoice))
-	}
+	workflow.ExecuteActivity(ctx, "main.orderCherryActivity", orderChoice)
 
 	logger.Info("Workflow completed.")
 	return nil
 }
 
-func getOrderActivity() (string, error) {
-	fmt.Printf("Order is for Cherry")
-	return "cherry", nil
-}
-
-func getAppleOrderActivity() (string, error) {
+func getBananaOrderActivity() (string, error) { // always returns a hard-coded order choice and a nil error
 	fmt.Printf("Order is for Apple")
 	return "apple", nil // NOTE(review): the name says Banana but the printed text and returned value are "apple" — confirm which is intended
 }
 
-func orderAppleActivity(choice string) error {
-	fmt.Printf("Order choice: %v\n", choice)
-	return nil
-}
-
 func orderBananaActivity(choice string) error {
 	fmt.Printf("Order choice: %v\n", choice)
 	return nil