Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Always release partition lease when a partition shuts down #335

Merged
merged 5 commits into from
Jan 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -115,9 +115,10 @@ public class NetheriteOrchestrationServiceSettings

/// <summary>
/// Whether to checkpoint the current state of a partition when it is stopped. This improves recovery time but
/// lengthens shutdown time.
/// lengthens shutdown time and can cause memory pressure if many partitions are stopped at the same time,
/// for example if a host is shutting down.
/// </summary>
public bool TakeStateCheckpointWhenStoppingPartition { get; set; } = true;
public bool TakeStateCheckpointWhenStoppingPartition { get; set; } = false;

/// <summary>
/// A limit on how many bytes to append to the log before initiating a state checkpoint. The default is 20MB.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -677,6 +677,8 @@ public async Task RenewLeaseTask()

public async Task MaintenanceLoopAsync()
{
bool releaseLeaseAtEnd = !this.UseLocalFiles;

this.TraceHelper.LeaseProgress("Started lease maintenance loop");
try
{
Expand Down Expand Up @@ -711,6 +713,7 @@ public async Task MaintenanceLoopAsync()
{
// We lost the lease to someone else. Terminate ownership immediately.
this.PartitionErrorHandler.HandleError(nameof(MaintenanceLoopAsync), "Lost partition lease", ex, true, true);
releaseLeaseAtEnd = false;
}
catch (Exception e)
{
Expand All @@ -729,24 +732,22 @@ public async Task MaintenanceLoopAsync()
this.TraceHelper.LeaseProgress("Waited for lease users to complete");

// release the lease
if (!this.UseLocalFiles)
if (releaseLeaseAtEnd)
{
try
{
this.TraceHelper.LeaseProgress("Releasing lease");

this.FaultInjector?.StorageAccess(this, "ReleaseLeaseAsync", "ReleaseLease", this.eventLogCommitBlob.Name);
await this.leaseClient.ReleaseAsync(null, this.PartitionErrorHandler.Token).ConfigureAwait(false);

// we must always release the lease here, whether the partition has already been terminated or not.
// otherwise, the recovery of this partition (on this host or another host) has to wait for up
// to 40s for the lease to expire. During this time, the partition is stalled (cannot process any work
// items or client requests). In particular, client requests targeting this partition may be heavily delayed.
await this.leaseClient.ReleaseAsync(conditions: null, CancellationToken.None).ConfigureAwait(false);

this.TraceHelper.LeaseReleased(this.leaseTimer.Elapsed.TotalSeconds);
}
catch (OperationCanceledException)
{
// it's o.k. if termination is triggered while waiting
}
catch (Azure.RequestFailedException e) when (e.InnerException != null && e.InnerException is OperationCanceledException)
{
// it's o.k. if termination is triggered while we are releasing the lease
}
catch (Exception e)
{
// we swallow, but still report exceptions when releasing a lease
Expand Down
6 changes: 6 additions & 0 deletions src/DurableTask.Netherite/Util/BlobUtils.cs
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,12 @@ public static bool IsTransientStorageError(Exception exception)
return true;
}

// Empirically observed: transient DNS failures
if (exception is Azure.RequestFailedException && exception.InnerException is System.Net.Http.HttpRequestException e2 && e2.Message.Contains("No such host is known"))
{
return true;
}

return false;
}

Expand Down