From 016e19f6de02742c87a75e4d95e816b6d277034e Mon Sep 17 00:00:00 2001 From: Debdatta Kunda <87335885+kundadebdatta@users.noreply.github.com> Date: Tue, 16 Jul 2024 09:45:39 -0700 Subject: [PATCH] [Internal] ClientRetryPolicy: Fixes Partition Failover on Next region when RG Fails with `HttpRequestException` (#4565) * Code changes to fix some of the flaky tests. * Code changes to fix client retry policy. * Code changes to add tests to validate the scenario. --- .../src/ClientRetryPolicy.cs | 8 ++ .../GlobalPartitionEndpointManagerCore.cs | 2 +- .../ClientRetryPolicyTests.cs | 99 ++++++++++++++++++- 3 files changed, 104 insertions(+), 5 deletions(-) diff --git a/Microsoft.Azure.Cosmos/src/ClientRetryPolicy.cs b/Microsoft.Azure.Cosmos/src/ClientRetryPolicy.cs index e19433c17e..3137a6f042 100644 --- a/Microsoft.Azure.Cosmos/src/ClientRetryPolicy.cs +++ b/Microsoft.Azure.Cosmos/src/ClientRetryPolicy.cs @@ -78,6 +78,14 @@ public async Task ShouldRetryAsync( this.documentServiceRequest?.RequestContext?.LocationEndpointToRoute?.ToString() ?? string.Empty, this.documentServiceRequest?.ResourceAddress ?? string.Empty); + if (this.isPertitionLevelFailoverEnabled) + { + // In the event of the routing gateway having an outage in region A, mark the partition as unavailable assuming that the + // partition has been failed over to region B, when per partition automatic failover is enabled. + this.partitionKeyRangeLocationCache.TryMarkEndpointUnavailableForPartitionKeyRange( + this.documentServiceRequest); + } + // Mark both read and write requests because it gateway exception. // This means all requests going to the region will fail. 
return await this.ShouldRetryOnEndpointFailureAsync( diff --git a/Microsoft.Azure.Cosmos/src/Routing/GlobalPartitionEndpointManagerCore.cs b/Microsoft.Azure.Cosmos/src/Routing/GlobalPartitionEndpointManagerCore.cs index fe2e256601..945e84e8fa 100644 --- a/Microsoft.Azure.Cosmos/src/Routing/GlobalPartitionEndpointManagerCore.cs +++ b/Microsoft.Azure.Cosmos/src/Routing/GlobalPartitionEndpointManagerCore.cs @@ -168,7 +168,7 @@ public override bool TryMarkEndpointUnavailableForPartitionKeyRange( } - private sealed class PartitionKeyRangeFailoverInfo + internal sealed class PartitionKeyRangeFailoverInfo { // HashSet is not thread safe and should only accessed in the lock private readonly HashSet FailedLocations; diff --git a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/ClientRetryPolicyTests.cs b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/ClientRetryPolicyTests.cs index cf1c739388..d4c56bae2e 100644 --- a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/ClientRetryPolicyTests.cs +++ b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/ClientRetryPolicyTests.cs @@ -14,8 +14,11 @@ using System.Threading.Tasks; using Microsoft.Azure.Documents.Collections; using Microsoft.Azure.Documents.Client; - using Microsoft.Azure.Cosmos.Common; - + using Microsoft.Azure.Cosmos.Common; + using System.Net.Http; + using System.Reflection; + using System.Collections.Concurrent; + /// /// Tests for /// @@ -122,6 +125,72 @@ public void Http503SubStatusHandelingTests(int testCode) Task retryStatus = retryPolicy.ShouldRetryAsync(documentClientException, cancellationToken); Assert.IsFalse(retryStatus.Result.ShouldRetry); + } + + /// + /// Tests to validate that when HttpRequestException is thrown while connecting to a gateway endpoint for a single master write account with PPAF enabled, + /// a partition level failover is added and the request is retried to the next region. 
+ /// + [TestMethod] + [DataRow(true, DisplayName = "Case when partition level failover is enabled.")] + [DataRow(false, DisplayName = "Case when partition level failover is disabled.")] + + public void HttpRequestExceptionHandelingTests( + bool enablePartitionLevelFailover) + { + const bool enableEndpointDiscovery = true; + const string suffix = "-FF-FF-FF-FF-FF-FF-FF-FF-FF-FF-FF-FF-FF-FF-FF"; + + //Creates a sample write request + DocumentServiceRequest request = this.CreateRequest(false, false); + request.RequestContext.ResolvedPartitionKeyRange = new PartitionKeyRange() { Id = "0" , MinInclusive = "3F" + suffix, MaxExclusive = "5F" + suffix }; + + //Create GlobalEndpointManager + using GlobalEndpointManager endpointManager = this.Initialize( + useMultipleWriteLocations: false, + enableEndpointDiscovery: enableEndpointDiscovery, + isPreferredLocationsListEmpty: false, + enablePartitionLevelFailover: enablePartitionLevelFailover); + + // Capture the read locations. + ReadOnlyCollection readLocations = endpointManager.ReadEndpoints; + + //Create Retry Policy + ClientRetryPolicy retryPolicy = new ( + globalEndpointManager: endpointManager, + partitionKeyRangeLocationCache: this.partitionKeyRangeLocationCache, + retryOptions: new RetryOptions(), + enableEndpointDiscovery: enableEndpointDiscovery, + isPertitionLevelFailoverEnabled: enablePartitionLevelFailover); + + CancellationToken cancellationToken = new (); + HttpRequestException httpRequestException = new (message: "Connecting to endpoint has failed."); + + GlobalPartitionEndpointManagerCore.PartitionKeyRangeFailoverInfo partitionKeyRangeFailoverInfo = ClientRetryPolicyTests.GetPartitionKeyRangeFailoverInfoUsingReflection( + this.partitionKeyRangeLocationCache, + request.RequestContext.ResolvedPartitionKeyRange); + + // Validate that the partition key range failover info is not present before the http request exception was captured in the retry policy. 
+ Assert.IsNull(partitionKeyRangeFailoverInfo); + + retryPolicy.OnBeforeSendRequest(request); + Task<ShouldRetryResult> retryStatus = retryPolicy.ShouldRetryAsync(httpRequestException, cancellationToken); + + Assert.IsTrue(retryStatus.Result.ShouldRetry); + + partitionKeyRangeFailoverInfo = ClientRetryPolicyTests.GetPartitionKeyRangeFailoverInfoUsingReflection( + this.partitionKeyRangeLocationCache, + request.RequestContext.ResolvedPartitionKeyRange); + + if (enablePartitionLevelFailover) + { + // Validate that the partition key range failover info to the next account region is present after the http request exception was captured in the retry policy. + Assert.AreEqual(partitionKeyRangeFailoverInfo.Current, readLocations[1]); + } + else + { + Assert.IsNull(partitionKeyRangeFailoverInfo); + } } [TestMethod] @@ -170,7 +239,7 @@ public Task ClientRetryPolicy_NoRetry_MultiMaster_Read_NoPreferredLocations() public Task ClientRetryPolicy_NoRetry_MultiMaster_Write_NoPreferredLocations() { return this.ValidateConnectTimeoutTriggersClientRetryPolicy(isReadRequest: false, useMultipleWriteLocations: true, usesPreferredLocations: false, false); - } + } private async Task ValidateConnectTimeoutTriggersClientRetryPolicy( bool isReadRequest, @@ -284,7 +353,29 @@ await BackoffRetryUtility.ExecuteAsync( } } } - } + } + + private static GlobalPartitionEndpointManagerCore.PartitionKeyRangeFailoverInfo GetPartitionKeyRangeFailoverInfoUsingReflection( + GlobalPartitionEndpointManager globalPartitionEndpointManager, + PartitionKeyRange pkRange) + { + FieldInfo fieldInfo = globalPartitionEndpointManager + .GetType() + .GetField( + name: "PartitionKeyRangeToLocation", + bindingAttr: BindingFlags.Instance | BindingFlags.NonPublic); + + if (fieldInfo != null) + { + Lazy<ConcurrentDictionary<PartitionKeyRange, GlobalPartitionEndpointManagerCore.PartitionKeyRangeFailoverInfo>> partitionKeyRangeToLocation = (Lazy<ConcurrentDictionary<PartitionKeyRange, GlobalPartitionEndpointManagerCore.PartitionKeyRangeFailoverInfo>>)fieldInfo.GetValue(globalPartitionEndpointManager); + partitionKeyRangeToLocation.Value.TryGetValue(pkRange, out GlobalPartitionEndpointManagerCore.PartitionKeyRangeFailoverInfo 
partitionKeyRangeFailoverInfo); + + return partitionKeyRangeFailoverInfo; + } + + return null; + } + private static AccountProperties CreateDatabaseAccount( bool useMultipleWriteLocations, bool enforceSingleMasterSingleWriteLocation)