Skip to content

Commit

Permalink
Always release partition lease when a partition shuts down (#335)
Browse files Browse the repository at this point in the history
* always release lease when a partition shuts down

* address PR feedback (add comment)

* recognize another empiric case of transient storage exception

* change default of partition checkpointing

* do not release lease if lease was already lost.
  • Loading branch information
sebastianburckhardt authored Jan 3, 2024
1 parent fc777fe commit b47eae7
Show file tree
Hide file tree
Showing 3 changed files with 20 additions and 12 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -115,9 +115,10 @@ public class NetheriteOrchestrationServiceSettings

/// <summary>
/// Whether to checkpoint the current state of a partition when it is stopped. This improves recovery time but
/// lengthens shutdown time.
/// lengthens shutdown time and can cause memory pressure if many partitions are stopped at the same time,
/// for example if a host is shutting down.
/// </summary>
public bool TakeStateCheckpointWhenStoppingPartition { get; set; } = true;
public bool TakeStateCheckpointWhenStoppingPartition { get; set; } = false;

/// <summary>
/// A limit on how many bytes to append to the log before initiating a state checkpoint. The default is 20MB.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -677,6 +677,8 @@ public async Task RenewLeaseTask()

public async Task MaintenanceLoopAsync()
{
bool releaseLeaseAtEnd = !this.UseLocalFiles;

this.TraceHelper.LeaseProgress("Started lease maintenance loop");
try
{
Expand Down Expand Up @@ -711,6 +713,7 @@ public async Task MaintenanceLoopAsync()
{
// We lost the lease to someone else. Terminate ownership immediately.
this.PartitionErrorHandler.HandleError(nameof(MaintenanceLoopAsync), "Lost partition lease", ex, true, true);
releaseLeaseAtEnd = false;
}
catch (Exception e)
{
Expand All @@ -729,24 +732,22 @@ public async Task MaintenanceLoopAsync()
this.TraceHelper.LeaseProgress("Waited for lease users to complete");

// release the lease
if (!this.UseLocalFiles)
if (releaseLeaseAtEnd)
{
try
{
this.TraceHelper.LeaseProgress("Releasing lease");

this.FaultInjector?.StorageAccess(this, "ReleaseLeaseAsync", "ReleaseLease", this.eventLogCommitBlob.Name);
await this.leaseClient.ReleaseAsync(null, this.PartitionErrorHandler.Token).ConfigureAwait(false);

// we must always release the lease here, whether the partition has already been terminated or not.
// otherwise, the recovery of this partition (on this host or another host) has to wait for up
// to 40s for the lease to expire. During this time, the partition is stalled (cannot process any work
// items or client requests). In particular, client requests targeting this partition may be heavily delayed.
await this.leaseClient.ReleaseAsync(conditions: null, CancellationToken.None).ConfigureAwait(false);

this.TraceHelper.LeaseReleased(this.leaseTimer.Elapsed.TotalSeconds);
}
catch (OperationCanceledException)
{
// it's o.k. if termination is triggered while waiting
}
catch (Azure.RequestFailedException e) when (e.InnerException != null && e.InnerException is OperationCanceledException)
{
// it's o.k. if termination is triggered while we are releasing the lease
}
catch (Exception e)
{
// we swallow, but still report exceptions when releasing a lease
Expand Down
6 changes: 6 additions & 0 deletions src/DurableTask.Netherite/Util/BlobUtils.cs
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,12 @@ public static bool IsTransientStorageError(Exception exception)
return true;
}

// Empirically observed: transient DNS failures
if (exception is Azure.RequestFailedException && exception.InnerException is System.Net.Http.HttpRequestException e2 && e2.Message.Contains("No such host is known"))
{
return true;
}

return false;
}

Expand Down

0 comments on commit b47eae7

Please sign in to comment.