diff --git a/src/DurableTask.Netherite/OrchestrationService/NetheriteOrchestrationServiceSettings.cs b/src/DurableTask.Netherite/OrchestrationService/NetheriteOrchestrationServiceSettings.cs
index fffcdcbc..687a7670 100644
--- a/src/DurableTask.Netherite/OrchestrationService/NetheriteOrchestrationServiceSettings.cs
+++ b/src/DurableTask.Netherite/OrchestrationService/NetheriteOrchestrationServiceSettings.cs
@@ -115,9 +115,10 @@ public class NetheriteOrchestrationServiceSettings
///
/// Whether to checkpoint the current state of a partition when it is stopped. This improves recovery time but
- /// lengthens shutdown time.
+ /// lengthens shutdown time and can cause memory pressure if many partitions are stopped at the same time,
+ /// for example when a host is shutting down. The default is false.
///
- public bool TakeStateCheckpointWhenStoppingPartition { get; set; } = true;
+ public bool TakeStateCheckpointWhenStoppingPartition { get; set; } = false;
///
/// A limit on how many bytes to append to the log before initiating a state checkpoint. The default is 20MB.
diff --git a/src/DurableTask.Netherite/StorageLayer/Faster/AzureBlobs/BlobManager.cs b/src/DurableTask.Netherite/StorageLayer/Faster/AzureBlobs/BlobManager.cs
index e26089e3..4b634f7d 100644
--- a/src/DurableTask.Netherite/StorageLayer/Faster/AzureBlobs/BlobManager.cs
+++ b/src/DurableTask.Netherite/StorageLayer/Faster/AzureBlobs/BlobManager.cs
@@ -677,6 +677,8 @@ public async Task RenewLeaseTask()
public async Task MaintenanceLoopAsync()
{
+ bool releaseLeaseAtEnd = !this.UseLocalFiles;
+
this.TraceHelper.LeaseProgress("Started lease maintenance loop");
try
{
@@ -711,6 +713,7 @@ public async Task MaintenanceLoopAsync()
{
// We lost the lease to someone else. Terminate ownership immediately.
this.PartitionErrorHandler.HandleError(nameof(MaintenanceLoopAsync), "Lost partition lease", ex, true, true);
+ releaseLeaseAtEnd = false;
}
catch (Exception e)
{
@@ -729,24 +732,22 @@ public async Task MaintenanceLoopAsync()
this.TraceHelper.LeaseProgress("Waited for lease users to complete");
// release the lease
- if (!this.UseLocalFiles)
+ if (releaseLeaseAtEnd)
{
try
{
this.TraceHelper.LeaseProgress("Releasing lease");
this.FaultInjector?.StorageAccess(this, "ReleaseLeaseAsync", "ReleaseLease", this.eventLogCommitBlob.Name);
- await this.leaseClient.ReleaseAsync(null, this.PartitionErrorHandler.Token).ConfigureAwait(false);
+
+ // Release the lease whether or not the partition has already been terminated; this is why the call below
+ // passes CancellationToken.None. Otherwise, the recovery of this partition (on this host or another host)
+ // has to wait for up to 40s for the lease to expire. During this time, the partition is stalled (it cannot
+ // process any work items or client requests). In particular, client requests targeting it may be heavily delayed.
+ await this.leaseClient.ReleaseAsync(conditions: null, CancellationToken.None).ConfigureAwait(false);
+
this.TraceHelper.LeaseReleased(this.leaseTimer.Elapsed.TotalSeconds);
}
- catch (OperationCanceledException)
- {
- // it's o.k. if termination is triggered while waiting
- }
- catch (Azure.RequestFailedException e) when (e.InnerException != null && e.InnerException is OperationCanceledException)
- {
- // it's o.k. if termination is triggered while we are releasing the lease
- }
catch (Exception e)
{
// we swallow, but still report exceptions when releasing a lease
diff --git a/src/DurableTask.Netherite/Util/BlobUtils.cs b/src/DurableTask.Netherite/Util/BlobUtils.cs
index 6cc94e72..d181d97e 100644
--- a/src/DurableTask.Netherite/Util/BlobUtils.cs
+++ b/src/DurableTask.Netherite/Util/BlobUtils.cs
@@ -100,6 +100,12 @@ public static bool IsTransientStorageError(Exception exception)
return true;
}
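+ // Empirically observed: transient DNS failures. NOTE(review): this matches on message text, which may differ by platform or locale - consider checking SocketException.SocketErrorCode as well.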
+ if (exception is Azure.RequestFailedException && exception.InnerException is System.Net.Http.HttpRequestException e2 && e2.Message.Contains("No such host is known"))
+ {
+ return true;
+ }
+
return false;
}