diff --git a/src/DurableTask.Netherite/OrchestrationService/NetheriteOrchestrationServiceSettings.cs b/src/DurableTask.Netherite/OrchestrationService/NetheriteOrchestrationServiceSettings.cs index fffcdcbc..687a7670 100644 --- a/src/DurableTask.Netherite/OrchestrationService/NetheriteOrchestrationServiceSettings.cs +++ b/src/DurableTask.Netherite/OrchestrationService/NetheriteOrchestrationServiceSettings.cs @@ -115,9 +115,10 @@ public class NetheriteOrchestrationServiceSettings /// /// Whether to checkpoint the current state of a partition when it is stopped. This improves recovery time but - /// lengthens shutdown time. + /// lengthens shutdown time and can cause memory pressure if many partitions are stopped at the same time, + /// for example if a host is shutting down. /// - public bool TakeStateCheckpointWhenStoppingPartition { get; set; } = true; + public bool TakeStateCheckpointWhenStoppingPartition { get; set; } = false; /// /// A limit on how many bytes to append to the log before initiating a state checkpoint. The default is 20MB. diff --git a/src/DurableTask.Netherite/StorageLayer/Faster/AzureBlobs/BlobManager.cs b/src/DurableTask.Netherite/StorageLayer/Faster/AzureBlobs/BlobManager.cs index e26089e3..4b634f7d 100644 --- a/src/DurableTask.Netherite/StorageLayer/Faster/AzureBlobs/BlobManager.cs +++ b/src/DurableTask.Netherite/StorageLayer/Faster/AzureBlobs/BlobManager.cs @@ -677,6 +677,8 @@ public async Task RenewLeaseTask() public async Task MaintenanceLoopAsync() { + bool releaseLeaseAtEnd = !this.UseLocalFiles; + this.TraceHelper.LeaseProgress("Started lease maintenance loop"); try { @@ -711,6 +713,7 @@ public async Task MaintenanceLoopAsync() { // We lost the lease to someone else. Terminate ownership immediately. this.PartitionErrorHandler.HandleError(nameof(MaintenanceLoopAsync), "Lost partition lease", ex, true, true); + releaseLeaseAtEnd = false; } catch (Exception e) { @@ -729,24 +732,22 @@ public async Task MaintenanceLoopAsync() this.TraceHelper.LeaseProgress("Waited for lease users to complete"); // release the lease - if (!this.UseLocalFiles) + if (releaseLeaseAtEnd) { try { this.TraceHelper.LeaseProgress("Releasing lease"); this.FaultInjector?.StorageAccess(this, "ReleaseLeaseAsync", "ReleaseLease", this.eventLogCommitBlob.Name); - await this.leaseClient.ReleaseAsync(null, this.PartitionErrorHandler.Token).ConfigureAwait(false); + + // we must always release the lease here, whether the partition has already been terminated or not. + // otherwise, the recovery of this partition (on this host or another host) has to wait for up + // to 40s for the lease to expire. During this time, the partition is stalled (cannot process any work + // items or client requests). In particular, client requests targeting this partition may be heavily delayed. + await this.leaseClient.ReleaseAsync(conditions: null, CancellationToken.None).ConfigureAwait(false); + this.TraceHelper.LeaseReleased(this.leaseTimer.Elapsed.TotalSeconds); } - catch (OperationCanceledException) - { - // it's o.k. if termination is triggered while waiting - } - catch (Azure.RequestFailedException e) when (e.InnerException != null && e.InnerException is OperationCanceledException) - { - // it's o.k. if termination is triggered while we are releasing the lease - } catch (Exception e) { // we swallow, but still report exceptions when releasing a lease diff --git a/src/DurableTask.Netherite/Util/BlobUtils.cs b/src/DurableTask.Netherite/Util/BlobUtils.cs index 6cc94e72..d181d97e 100644 --- a/src/DurableTask.Netherite/Util/BlobUtils.cs +++ b/src/DurableTask.Netherite/Util/BlobUtils.cs @@ -100,6 +100,12 @@ public static bool IsTransientStorageError(Exception exception) return true; } + // Empirically observed: transient DNS failures + if (exception is Azure.RequestFailedException && exception.InnerException is System.Net.Http.HttpRequestException e2 && e2.Message.Contains("No such host is known")) + { + return true; + } + return false; }