From 1d1b94ff0cb7420eb41af003942228affa2a737b Mon Sep 17 00:00:00 2001 From: FHIBF Date: Tue, 11 Apr 2023 08:58:54 -0700 Subject: [PATCH 1/3] Adding cancellation token to health check + logic to retry transient exceptions --- .../Features/Health/CosmosHealthCheckTests.cs | 45 +++++++++++- .../Features/Health/CosmosHealthCheck.cs | 73 +++++++++++++------ 2 files changed, 96 insertions(+), 22 deletions(-) diff --git a/src/Microsoft.Health.Fhir.CosmosDb.UnitTests/Features/Health/CosmosHealthCheckTests.cs b/src/Microsoft.Health.Fhir.CosmosDb.UnitTests/Features/Health/CosmosHealthCheckTests.cs index 1191076396..41c3ca4b26 100644 --- a/src/Microsoft.Health.Fhir.CosmosDb.UnitTests/Features/Health/CosmosHealthCheckTests.cs +++ b/src/Microsoft.Health.Fhir.CosmosDb.UnitTests/Features/Health/CosmosHealthCheckTests.cs @@ -10,7 +10,6 @@ using System.Net.Http; using System.Threading; using System.Threading.Tasks; -using AngleSharp.Common; using Microsoft.Azure.Cosmos; using Microsoft.Extensions.Diagnostics.HealthChecks; using Microsoft.Extensions.Logging.Abstractions; @@ -59,6 +58,50 @@ public async Task GivenCosmosDbCanBeQueried_WhenHealthIsChecked_ThenHealthyState Assert.Equal(HealthStatus.Healthy, result.Status); } + [Fact] + public async Task GivenCosmosDb_WhenCosmosOperationCanceledExceptionIsAlwaysThrown_ThenUnhealthyStateShouldBeReturned() + { + // This test simulates that all Health Check calls result in OperationCanceledExceptions. + // And all retries should fail. + + var diagnostics = Substitute.For(); + var coce = new CosmosOperationCanceledException(originalException: new OperationCanceledException(), diagnostics); + + _testProvider.PerformTestAsync(default, default, _cosmosCollectionConfiguration, CancellationToken.None).ThrowsForAnyArgs(coce); + HealthCheckResult result = await _healthCheck.CheckHealthAsync(new HealthCheckContext()); + + Assert.Equal(HealthStatus.Unhealthy, result.Status); + _testProvider.ReceivedWithAnyArgs(3); + } + + [Fact] + public async Task GivenCosmosDb_WhenCosmosOperationCanceledExceptionIsOnceThrown_ThenUnhealthyStateShouldBeReturned() + { + // This test simulates that the first call to Health Check results in an OperationCanceledException. + // The first attempt should fail, but the next ones should pass. + + var diagnostics = Substitute.For(); + var coce = new CosmosOperationCanceledException(originalException: new OperationCanceledException(), diagnostics); + + int runs = 0; + Func fakeRetry = () => + { + runs++; + if (runs == 1) + { + throw coce; + } + + return Task.CompletedTask; + }; + + _testProvider.PerformTestAsync(default, default, _cosmosCollectionConfiguration, CancellationToken.None).ReturnsForAnyArgs(x => fakeRetry()); + HealthCheckResult result = await _healthCheck.CheckHealthAsync(new HealthCheckContext()); + + Assert.Equal(HealthStatus.Healthy, result.Status); + _testProvider.ReceivedWithAnyArgs(2); + } + [Fact] public async Task GivenCosmosDbCannotBeQueried_WhenHealthIsChecked_ThenUnhealthyStateShouldBeReturned() { diff --git a/src/Microsoft.Health.Fhir.CosmosDb/Features/Health/CosmosHealthCheck.cs b/src/Microsoft.Health.Fhir.CosmosDb/Features/Health/CosmosHealthCheck.cs index 41a8c05730..a40b091ac3 100644 --- a/src/Microsoft.Health.Fhir.CosmosDb/Features/Health/CosmosHealthCheck.cs +++ b/src/Microsoft.Health.Fhir.CosmosDb/Features/Health/CosmosHealthCheck.cs @@ -57,31 +57,62 @@ public CosmosHealthCheck( public async Task CheckHealthAsync(HealthCheckContext context, CancellationToken cancellationToken = default) { - try + const int maxExecutionTimeInSeconds = 30; + const int maxNumberAttempts = 3; + int attempt = 0; + do { - // Make a non-invasive query to make sure we can reach the data store. + cancellationToken.ThrowIfCancellationRequested(); + try + { + using (CancellationTokenSource timeBasedTokenSource = new CancellationTokenSource(TimeSpan.FromSeconds(maxExecutionTimeInSeconds))) + using (CancellationTokenSource operationTokenSource = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken, timeBasedTokenSource.Token)) + { + await _testProvider.PerformTestAsync(_container.Value, _configuration, _cosmosCollectionConfiguration, operationTokenSource.Token); + return HealthCheckResult.Healthy("Successfully connected to the data store."); + } + } + catch (CosmosOperationCanceledException coce) + { + // CosmosOperationCanceledException are "safe to retry on and can be treated as timeouts from the retrying perspective.". + // Reference: https://learn.microsoft.com/en-us/azure/cosmos-db/nosql/troubleshoot-dotnet-sdk-request-timeout?tabs=cpu-new + attempt++; - await _testProvider.PerformTestAsync(_container.Value, _configuration, _cosmosCollectionConfiguration, cancellationToken); + if (cancellationToken.IsCancellationRequested) + { + _logger.LogWarning(coce, "Failed to connect to the data store. External cancellation requested."); + return HealthCheckResult.Unhealthy("Failed to connect to the data store. External cancellation requested."); + } + else if (attempt >= maxNumberAttempts) + { + _logger.LogWarning(coce, "Failed to connect to the data store. There were {NumberOfAttempts} attempts to connect to the data store, but they suffered a '{ExceptionType}'.", attempt, nameof(CosmosOperationCanceledException)); + return HealthCheckResult.Unhealthy("Failed to connect to the data store. CosmosOperationCanceledException."); + } + else + { + // Number of attempts not reached. Allow retry. + _logger.LogWarning(coce, "Failed to connect to the data store. Attempt {NumberOfAttempts}. '{ExceptionType}'.", attempt, nameof(CosmosOperationCanceledException)); + } + } + catch (CosmosException ex) when (ex.IsCmkClientError()) + { + return HealthCheckResult.Unhealthy( + "Connection to the data store was unsuccesful because the client's customer-managed key is not available.", + exception: ex, + new Dictionary() { { "IsCustomerManagedKeyError", true } }); + } + catch (Exception ex) when (ex.IsRequestRateExceeded()) + { + return HealthCheckResult.Healthy("Connection to the data store was successful, however, the rate limit has been exceeded."); + } + catch (Exception ex) + { + _logger.LogWarning(ex, "Failed to connect to the data store."); - return HealthCheckResult.Healthy("Successfully connected to the data store."); - } - catch (CosmosException ex) when (ex.IsCmkClientError()) - { - return HealthCheckResult.Unhealthy( - "Connection to the data store was unsuccesful because the client's customer-managed key is not available.", - exception: ex, - new Dictionary() { { "IsCustomerManagedKeyError", true } }); - } - catch (Exception ex) when (ex.IsRequestRateExceeded()) - { - return HealthCheckResult.Healthy("Connection to the data store was successful, however, the rate limit has been exceeded."); - } - catch (Exception ex) - { - _logger.LogWarning(ex, "Failed to connect to the data store."); - - return HealthCheckResult.Unhealthy("Failed to connect to the data store."); + return HealthCheckResult.Unhealthy("Failed to connect to the data store."); + } } + while (true); } } } From 469e93506bc6ea1c17a30fb35d0473a44d2fe816 Mon Sep 17 00:00:00 2001 From: FHIBF Date: Tue, 11 Apr 2023 10:56:24 -0700 Subject: [PATCH 2/3] Improving error message. --- .../Features/Health/CosmosHealthCheck.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Microsoft.Health.Fhir.CosmosDb/Features/Health/CosmosHealthCheck.cs b/src/Microsoft.Health.Fhir.CosmosDb/Features/Health/CosmosHealthCheck.cs index a40b091ac3..9e59aa74e8 100644 --- a/src/Microsoft.Health.Fhir.CosmosDb/Features/Health/CosmosHealthCheck.cs +++ b/src/Microsoft.Health.Fhir.CosmosDb/Features/Health/CosmosHealthCheck.cs @@ -86,7 +86,7 @@ public async Task CheckHealthAsync(HealthCheckContext context else if (attempt >= maxNumberAttempts) { _logger.LogWarning(coce, "Failed to connect to the data store. There were {NumberOfAttempts} attempts to connect to the data store, but they suffered a '{ExceptionType}'.", attempt, nameof(CosmosOperationCanceledException)); - return HealthCheckResult.Unhealthy("Failed to connect to the data store. CosmosOperationCanceledException."); + return HealthCheckResult.Unhealthy("Failed to connect to the data store. Operation canceled."); } else { From f79a0d9019f28975ec11710aedb24a87e048919a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fernando=20Henrique=20Inoc=C3=AAncio=20Borba=20Ferreira?= Date: Wed, 12 Apr 2023 10:04:07 -0700 Subject: [PATCH 3/3] Renaming test. --- .../Features/Health/CosmosHealthCheckTests.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Microsoft.Health.Fhir.CosmosDb.UnitTests/Features/Health/CosmosHealthCheckTests.cs b/src/Microsoft.Health.Fhir.CosmosDb.UnitTests/Features/Health/CosmosHealthCheckTests.cs index 41c3ca4b26..b5b9be0e09 100644 --- a/src/Microsoft.Health.Fhir.CosmosDb.UnitTests/Features/Health/CosmosHealthCheckTests.cs +++ b/src/Microsoft.Health.Fhir.CosmosDb.UnitTests/Features/Health/CosmosHealthCheckTests.cs @@ -75,7 +75,7 @@ public async Task GivenCosmosDb_WhenCosmosOperationCanceledExceptionIsAlwaysThro } [Fact] - public async Task GivenCosmosDb_WhenCosmosOperationCanceledExceptionIsOnceThrown_ThenUnhealthyStateShouldBeReturned() + public async Task GivenCosmosDb_WhenCosmosOperationCanceledExceptionIsOnceThrown_ThenHealthyStateShouldBeReturned() { // This test simulates that the first call to Health Check results in an OperationCanceledException. // The first attempt should fail, but the next ones should pass.