Skip to content

Commit

Permalink
[Internal] ClientRetryPolicy: Fixes Partition Failover on Next region…
Browse files Browse the repository at this point in the history
… when RG Fails with `HttpRequestException` (#4565)

* Code changes to fix some of the flakey tests.

* Code changes to fix client retry policy.

* Code changes to add tests to validate the scenario.
  • Loading branch information
kundadebdatta authored Jul 16, 2024
1 parent 6e1d40d commit 016e19f
Show file tree
Hide file tree
Showing 3 changed files with 104 additions and 5 deletions.
8 changes: 8 additions & 0 deletions Microsoft.Azure.Cosmos/src/ClientRetryPolicy.cs
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,14 @@ public async Task<ShouldRetryResult> ShouldRetryAsync(
this.documentServiceRequest?.RequestContext?.LocationEndpointToRoute?.ToString() ?? string.Empty,
this.documentServiceRequest?.ResourceAddress ?? string.Empty);

if (this.isPertitionLevelFailoverEnabled)
{
// In the event of the routing gateway having an outage in region A, mark the partition as unavailable, assuming that the
// partition has been failed over to region B, when per partition automatic failover is enabled.
this.partitionKeyRangeLocationCache.TryMarkEndpointUnavailableForPartitionKeyRange(
this.documentServiceRequest);
}

// Mark both read and write requests because it is a gateway exception.
// This means all requests going to the region will fail.
return await this.ShouldRetryOnEndpointFailureAsync(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,7 @@ public override bool TryMarkEndpointUnavailableForPartitionKeyRange(

}

private sealed class PartitionKeyRangeFailoverInfo
internal sealed class PartitionKeyRangeFailoverInfo
{
// HashSet is not thread safe and should only be accessed inside the lock
private readonly HashSet<Uri> FailedLocations;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,11 @@
using System.Threading.Tasks;
using Microsoft.Azure.Documents.Collections;
using Microsoft.Azure.Documents.Client;
using Microsoft.Azure.Cosmos.Common;

using Microsoft.Azure.Cosmos.Common;
using System.Net.Http;
using System.Reflection;
using System.Collections.Concurrent;

/// <summary>
/// Tests for <see cref="ClientRetryPolicy"/>
/// </summary>
Expand Down Expand Up @@ -122,6 +125,72 @@ public void Http503SubStatusHandelingTests(int testCode)
Task<ShouldRetryResult> retryStatus = retryPolicy.ShouldRetryAsync(documentClientException, cancellationToken);

Assert.IsFalse(retryStatus.Result.ShouldRetry);
}

/// <summary>
/// Tests to validate that when an <see cref="HttpRequestException"/> is thrown while connecting to a gateway endpoint
/// for a single master write account, the request is always retried. When PPAF (per partition automatic failover) is
/// enabled, a partition level failover entry must additionally be recorded for the request's partition key range and
/// must point to the second read endpoint; when it is disabled, no entry must be recorded.
/// </summary>
/// <param name="enablePartitionLevelFailover">A boolean flag indicating if partition level failover is enabled for the test run.</param>
[TestMethod]
[DataRow(true, DisplayName = "Case when partition level failover is enabled.")]
[DataRow(false, DisplayName = "Case when partition level failover is disabled.")]
public void HttpRequestExceptionHandelingTests(
    bool enablePartitionLevelFailover)
{
    const bool enableEndpointDiscovery = true;
    const string suffix = "-FF-FF-FF-FF-FF-FF-FF-FF-FF-FF-FF-FF-FF-FF-FF";

    //Creates a sample write request
    DocumentServiceRequest request = this.CreateRequest(false, false);
    request.RequestContext.ResolvedPartitionKeyRange = new PartitionKeyRange() { Id = "0" , MinInclusive = "3F" + suffix, MaxExclusive = "5F" + suffix };

    //Create GlobalEndpointManager
    using GlobalEndpointManager endpointManager = this.Initialize(
        useMultipleWriteLocations: false,
        enableEndpointDiscovery: enableEndpointDiscovery,
        isPreferredLocationsListEmpty: false,
        enablePartitionLevelFailover: enablePartitionLevelFailover);

    // Capture the read locations.
    ReadOnlyCollection<Uri> readLocations = endpointManager.ReadEndpoints;

    //Create Retry Policy
    ClientRetryPolicy retryPolicy = new (
        globalEndpointManager: endpointManager,
        partitionKeyRangeLocationCache: this.partitionKeyRangeLocationCache,
        retryOptions: new RetryOptions(),
        enableEndpointDiscovery: enableEndpointDiscovery,
        isPertitionLevelFailoverEnabled: enablePartitionLevelFailover);

    CancellationToken cancellationToken = new ();
    HttpRequestException httpRequestException = new (message: "Connecting to endpoint has failed.");

    GlobalPartitionEndpointManagerCore.PartitionKeyRangeFailoverInfo partitionKeyRangeFailoverInfo = ClientRetryPolicyTests.GetPartitionKeyRangeFailoverInfoUsingReflection(
        this.partitionKeyRangeLocationCache,
        request.RequestContext.ResolvedPartitionKeyRange);

    // Validate that the partition key range failover info is not present before the http request exception was captured in the retry policy.
    Assert.IsNull(partitionKeyRangeFailoverInfo);

    retryPolicy.OnBeforeSendRequest(request);
    Task<ShouldRetryResult> retryStatus = retryPolicy.ShouldRetryAsync(httpRequestException, cancellationToken);

    Assert.IsTrue(retryStatus.Result.ShouldRetry);

    // Re-read the failover info now that the retry policy has processed the exception.
    partitionKeyRangeFailoverInfo = ClientRetryPolicyTests.GetPartitionKeyRangeFailoverInfoUsingReflection(
        this.partitionKeyRangeLocationCache,
        request.RequestContext.ResolvedPartitionKeyRange);

    if (enablePartitionLevelFailover)
    {
        // Fail with a readable assertion (instead of a NullReferenceException on .Current) when no failover entry was recorded.
        Assert.IsNotNull(
            partitionKeyRangeFailoverInfo,
            "A partition key range failover info should be present when partition level failover is enabled.");

        // Validate that the partition key range failover info to the next account region is present after the http request exception was captured in the retry policy.
        // The expected value goes first so that a failure message reports expected/actual the right way around.
        Assert.AreEqual(readLocations[1], partitionKeyRangeFailoverInfo.Current);
    }
    else
    {
        Assert.IsNull(partitionKeyRangeFailoverInfo);
    }
}

[TestMethod]
Expand Down Expand Up @@ -170,7 +239,7 @@ public Task ClientRetryPolicy_NoRetry_MultiMaster_Read_NoPreferredLocations()
public Task ClientRetryPolicy_NoRetry_MultiMaster_Write_NoPreferredLocations()
{
// Delegates to the shared connect-timeout validation for a write request (isReadRequest: false)
// with multiple write locations enabled and preferred locations not used.
// NOTE(review): the trailing positional `false` is unnamed; its meaning is not visible from this call site - confirm against the helper's signature.
return this.ValidateConnectTimeoutTriggersClientRetryPolicy(isReadRequest: false, useMultipleWriteLocations: true, usesPreferredLocations: false, false);
}
}

private async Task ValidateConnectTimeoutTriggersClientRetryPolicy(
bool isReadRequest,
Expand Down Expand Up @@ -284,7 +353,29 @@ await BackoffRetryUtility<StoreResponse>.ExecuteAsync(
}
}
}
}
}

/// <summary>
/// Reads the non-public instance field "PartitionKeyRangeToLocation" from the given endpoint manager through reflection
/// and looks up the failover info stored for the given partition key range.
/// </summary>
/// <param name="globalPartitionEndpointManager">The endpoint manager instance whose runtime type is inspected for the field.</param>
/// <param name="pkRange">The partition key range used as the lookup key.</param>
/// <returns>
/// The failover info stored for <paramref name="pkRange"/>; null when the runtime type does not declare the field
/// or when the dictionary holds no entry for the range.
/// </returns>
private static GlobalPartitionEndpointManagerCore.PartitionKeyRangeFailoverInfo GetPartitionKeyRangeFailoverInfoUsingReflection(
    GlobalPartitionEndpointManager globalPartitionEndpointManager,
    PartitionKeyRange pkRange)
{
    const BindingFlags nonPublicInstance = BindingFlags.Instance | BindingFlags.NonPublic;

    Type managerType = globalPartitionEndpointManager.GetType();
    FieldInfo locationMapField = managerType.GetField("PartitionKeyRangeToLocation", nonPublicInstance);

    // The runtime type may not declare the field at all; report "no failover info" in that case.
    if (locationMapField == null)
    {
        return null;
    }

    object rawFieldValue = locationMapField.GetValue(globalPartitionEndpointManager);
    Lazy<ConcurrentDictionary<PartitionKeyRange, GlobalPartitionEndpointManagerCore.PartitionKeyRangeFailoverInfo>> lazyLocationMap =
        (Lazy<ConcurrentDictionary<PartitionKeyRange, GlobalPartitionEndpointManagerCore.PartitionKeyRangeFailoverInfo>>)rawFieldValue;

    // A failed lookup yields null, matching the default value of the out parameter.
    return lazyLocationMap.Value.TryGetValue(pkRange, out GlobalPartitionEndpointManagerCore.PartitionKeyRangeFailoverInfo failoverInfo)
        ? failoverInfo
        : null;
}

private static AccountProperties CreateDatabaseAccount(
bool useMultipleWriteLocations,
bool enforceSingleMasterSingleWriteLocation)
Expand Down

0 comments on commit 016e19f

Please sign in to comment.