From cf11c76c509daf7e3fd9e2b6ee11375a6dd8bd14 Mon Sep 17 00:00:00 2001 From: Sebastian Burckhardt Date: Mon, 14 Jun 2021 12:56:30 -0700 Subject: [PATCH] fix replay bug in activity scheduler, and add configuration setting --- .../ActivitySchedulerOptions.cs | 25 +++++++++++++++++++ .../NetheriteOrchestrationServiceSettings.cs | 8 +++++- .../PartitionState/ActivitiesState.cs | 25 +++++++++++-------- test/PerformanceTests/host.json | 4 +++ .../series/host.neth-12-ls.json | 7 +++--- 5 files changed, 54 insertions(+), 15 deletions(-) create mode 100644 src/DurableTask.Netherite/OrchestrationService/ActivitySchedulerOptions.cs diff --git a/src/DurableTask.Netherite/OrchestrationService/ActivitySchedulerOptions.cs b/src/DurableTask.Netherite/OrchestrationService/ActivitySchedulerOptions.cs new file mode 100644 index 00000000..6270e4c1 --- /dev/null +++ b/src/DurableTask.Netherite/OrchestrationService/ActivitySchedulerOptions.cs @@ -0,0 +1,25 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +namespace DurableTask.Netherite +{ + using System; + using System.Collections.Generic; + using System.Text; + + /// + /// Settings for how activities . + /// + public enum ActivitySchedulerOptions + { + /// + /// All activities are scheduled on the same partition as the orchestration. + /// + Local, + + /// + /// Activities are scheduled locally if possible, but backlog is offloaded periodically. + /// + PeriodicOffload, + } +} diff --git a/src/DurableTask.Netherite/OrchestrationService/NetheriteOrchestrationServiceSettings.cs b/src/DurableTask.Netherite/OrchestrationService/NetheriteOrchestrationServiceSettings.cs index 25f854c7..93f0b092 100644 --- a/src/DurableTask.Netherite/OrchestrationService/NetheriteOrchestrationServiceSettings.cs +++ b/src/DurableTask.Netherite/OrchestrationService/NetheriteOrchestrationServiceSettings.cs @@ -90,6 +90,12 @@ public class NetheriteOrchestrationServiceSettings [JsonConverter(typeof(StringEnumConverter))] public PartitionManagementOptions PartitionManagement { get; set; } = PartitionManagementOptions.EventProcessorHost; + /// + /// Gets or sets the activity scheduler option + /// + [JsonConverter(typeof(StringEnumConverter))] + public ActivitySchedulerOptions ActivityScheduler { get; set; } = ActivitySchedulerOptions.PeriodicOffload; + /// /// Gets or sets a flag indicating whether to enable caching of execution cursors to avoid replay. /// @@ -150,7 +156,7 @@ public class NetheriteOrchestrationServiceSettings public bool UseAlternateObjectStore { get; set; } = false; /// - /// Forces steps to pe persisted before applying their effects, thus disabling all speculation. + /// Forces steps to pe persisted before applying their effects, disabling all pipelining. /// public bool PersistStepsFirst { get; set; } = false; diff --git a/src/DurableTask.Netherite/PartitionState/ActivitiesState.cs b/src/DurableTask.Netherite/PartitionState/ActivitiesState.cs index fd179243..50cf2fde 100644 --- a/src/DurableTask.Netherite/PartitionState/ActivitiesState.cs +++ b/src/DurableTask.Netherite/PartitionState/ActivitiesState.cs @@ -116,11 +116,14 @@ public override string ToString() void ScheduleNextOffloadDecision(TimeSpan delay) { - this.Partition.PendingTimers.Schedule(DateTime.UtcNow + delay, new OffloadDecision() + if (this.Partition.Settings.ActivityScheduler != ActivitySchedulerOptions.Local) { - PartitionId = this.Partition.PartitionId, - Timestamp = DateTime.UtcNow + delay, - }); + this.Partition.PendingTimers.Schedule(DateTime.UtcNow + delay, new OffloadDecision() + { + PartitionId = this.Partition.PartitionId, + Timestamp = DateTime.UtcNow + delay, + }); + } } public bool TryGetNextActivity(out ActivityInfo activityInfo) @@ -292,7 +295,7 @@ public void Process(OffloadDecision offloadDecisionEvent, EffectTracker effects) // we are adding (nonpersisted) information to the event just as a way of passing it to the OutboxState offloadDecisionEvent.DestinationPartitionId = target; - offloadDecisionEvent.OffloadedActivities = new List<(TaskMessage,string)>(); + offloadDecisionEvent.OffloadedActivities = new List<(TaskMessage, string)>(); for (int i = 0; i < maxBatchsize; i++) { @@ -314,10 +317,10 @@ public void Process(OffloadDecision offloadDecisionEvent, EffectTracker effects) if (!effects.IsReplaying) { this.Partition.EventTraceHelper.TracePartitionOffloadDecision(this.EstimatedLocalWorkItemLoad, this.Pending.Count, this.LocalBacklog.Count, this.QueuedRemotes.Count, reportedRemotes); - } - // try again relatively soon - this.ScheduleNextOffloadDecision(TimeSpan.FromMilliseconds(200)); + // try again relatively soon + this.ScheduleNextOffloadDecision(TimeSpan.FromMilliseconds(200)); + } } else { @@ -326,10 +329,10 @@ public void Process(OffloadDecision offloadDecisionEvent, EffectTracker effects) if (!effects.IsReplaying) { this.Partition.EventTraceHelper.TracePartitionOffloadDecision(this.EstimatedLocalWorkItemLoad, this.Pending.Count, this.LocalBacklog.Count, this.QueuedRemotes.Count, reportedRemotes); - } - // there are no eligible recipients... try again in a while - this.ScheduleNextOffloadDecision(TimeSpan.FromSeconds(10)); + // there are no eligible recipients... try again in a while + this.ScheduleNextOffloadDecision(TimeSpan.FromSeconds(10)); + } } } diff --git a/test/PerformanceTests/host.json b/test/PerformanceTests/host.json index 3c33fab5..28366882 100644 --- a/test/PerformanceTests/host.json +++ b/test/PerformanceTests/host.json @@ -74,8 +74,12 @@ "PersistStepsFirst": false, // set this to "Scripted" to control the scenario with a partition script + // or to "ClientOnly" to run only the client "PartitionManagement": "EventProcessorHost", + // set this to "Local" to disable the global activity distribution algorithm + "ActivityScheduler": "PeriodicOffload", + // The log level limits below control the production of log events by the various components. // it limits production, not just consumption, of the events, so it can be used to prevent overheads. // "Debug" is a reasonable setting, as it allows troubleshooting without impacting perf too much. diff --git a/test/PerformanceTests/series/host.neth-12-ls.json b/test/PerformanceTests/series/host.neth-12-ls.json index 43f76eac..368603da 100644 --- a/test/PerformanceTests/series/host.neth-12-ls.json +++ b/test/PerformanceTests/series/host.neth-12-ls.json @@ -68,11 +68,12 @@ "PersistStepsFirst": false, // set this to "Scripted" to control the scenario with a partition script + // or to "ClientOnly" to run only the client "PartitionManagement": "EventProcessorHost", - // set this to false to disable activity offloading - "DistributeActivities": false, - + // set this to "Local" to disable the global activity distribution algorithm + "ActivityScheduler": "PeriodicOffload", + // The log level limits below control the production of log events by the various components. // it limits production, not just consumption, of the events, so it can be used to prevent overheads. // "Debug" is a reasonable setting, as it allows troubleshooting without impacting perf too much.