Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Health Check] Limit of 30 seconds + retry #3224

Merged
merged 3 commits into from
Apr 12, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
using System.Net.Http;
using System.Threading;
using System.Threading.Tasks;
using AngleSharp.Common;
using Microsoft.Azure.Cosmos;
using Microsoft.Extensions.Diagnostics.HealthChecks;
using Microsoft.Extensions.Logging.Abstractions;
Expand Down Expand Up @@ -59,6 +58,50 @@ public async Task GivenCosmosDbCanBeQueried_WhenHealthIsChecked_ThenHealthyState
Assert.Equal(HealthStatus.Healthy, result.Status);
}

[Fact]
public async Task GivenCosmosDb_WhenCosmosOperationCanceledExceptionIsAlwaysThrown_ThenUnhealthyStateShouldBeReturned()
{
    // This test simulates that all Health Check calls result in OperationCanceledExceptions.
    // Every retry fails, so the health check is expected to report Unhealthy.

    var diagnostics = Substitute.For<CosmosDiagnostics>();
    var coce = new CosmosOperationCanceledException(originalException: new OperationCanceledException(), diagnostics);

    _testProvider.PerformTestAsync(default, default, _cosmosCollectionConfiguration, CancellationToken.None).ThrowsForAnyArgs(coce);
    HealthCheckResult result = await _healthCheck.CheckHealthAsync(new HealthCheckContext());

    Assert.Equal(HealthStatus.Unhealthy, result.Status);

    // ReceivedWithAnyArgs(n) verifies nothing by itself: NSubstitute checks the call that is made
    // on the object it returns. The expected call must follow for the count to be asserted.
    await _testProvider.ReceivedWithAnyArgs(3).PerformTestAsync(default, default, _cosmosCollectionConfiguration, CancellationToken.None);
}

[Fact]
public async Task GivenCosmosDb_WhenCosmosOperationCanceledExceptionIsOnceThrown_ThenHealthyStateShouldBeReturned()
{
    // This test simulates that the first call to Health Check results in an OperationCanceledException.
    // The first attempt should fail, but the next one should pass and yield a Healthy result.

    var diagnostics = Substitute.For<CosmosDiagnostics>();
    var coce = new CosmosOperationCanceledException(originalException: new OperationCanceledException(), diagnostics);

    int runs = 0;
    Func<Task> fakeRetry = () =>
    {
        runs++;
        if (runs == 1)
        {
            // Only the very first invocation fails.
            throw coce;
        }

        return Task.CompletedTask;
    };

    _testProvider.PerformTestAsync(default, default, _cosmosCollectionConfiguration, CancellationToken.None).ReturnsForAnyArgs(x => fakeRetry());
    HealthCheckResult result = await _healthCheck.CheckHealthAsync(new HealthCheckContext());

    Assert.Equal(HealthStatus.Healthy, result.Status);

    // ReceivedWithAnyArgs(n) verifies nothing by itself: NSubstitute checks the call that is made
    // on the object it returns. One failed attempt plus one successful retry gives two calls.
    await _testProvider.ReceivedWithAnyArgs(2).PerformTestAsync(default, default, _cosmosCollectionConfiguration, CancellationToken.None);
}

[Fact]
public async Task GivenCosmosDbCannotBeQueried_WhenHealthIsChecked_ThenUnhealthyStateShouldBeReturned()
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -57,31 +57,62 @@ public CosmosHealthCheck(

/// <summary>
/// Checks whether the data store can be reached by running the test provider's query.
/// Each attempt is limited to 30 seconds, and an attempt that ends in a
/// <see cref="CosmosOperationCanceledException"/> is retried, for up to three attempts in total.
/// </summary>
/// <param name="context">The health check context. It is not read by this method.</param>
/// <param name="cancellationToken">External token; it is linked with the per-attempt timeout.</param>
/// <returns>
/// Healthy when the query succeeds or the request rate limit is exceeded; Unhealthy otherwise.
/// </returns>
/// <exception cref="OperationCanceledException">
/// Thrown when <paramref name="cancellationToken"/> is already canceled at the start of an attempt.
/// </exception>
public async Task<HealthCheckResult> CheckHealthAsync(HealthCheckContext context, CancellationToken cancellationToken = default)
{
    const int maxExecutionTimeInSeconds = 30;
    const int maxNumberAttempts = 3;
    int attempt = 0;
    do
    {
        // Make a non-invasive query to make sure we can reach the data store.
        cancellationToken.ThrowIfCancellationRequested();
        try
        {
            // Link the caller's token with a time-based one so a single attempt cannot run past the limit.
            using (CancellationTokenSource timeBasedTokenSource = new CancellationTokenSource(TimeSpan.FromSeconds(maxExecutionTimeInSeconds)))
            using (CancellationTokenSource operationTokenSource = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken, timeBasedTokenSource.Token))
            {
                await _testProvider.PerformTestAsync(_container.Value, _configuration, _cosmosCollectionConfiguration, operationTokenSource.Token);
                return HealthCheckResult.Healthy("Successfully connected to the data store.");
            }
        }
        catch (CosmosOperationCanceledException coce)
        {
            // CosmosOperationCanceledException are "safe to retry on and can be treated as timeouts from the retrying perspective.".
            // Reference: https://learn.microsoft.com/en-us/azure/cosmos-db/nosql/troubleshoot-dotnet-sdk-request-timeout?tabs=cpu-new
            attempt++;

            if (cancellationToken.IsCancellationRequested)
            {
                // The caller canceled, so this was not a timeout: do not retry.
                _logger.LogWarning(coce, "Failed to connect to the data store. External cancellation requested.");
                return HealthCheckResult.Unhealthy("Failed to connect to the data store. External cancellation requested.");
            }
            else if (attempt >= maxNumberAttempts)
            {
                _logger.LogWarning(coce, "Failed to connect to the data store. There were {NumberOfAttempts} attempts to connect to the data store, but they suffered a '{ExceptionType}'.", attempt, nameof(CosmosOperationCanceledException));
                return HealthCheckResult.Unhealthy("Failed to connect to the data store. Operation canceled.");
            }
            else
            {
                // Number of attempts not reached. Allow retry.
                _logger.LogWarning(coce, "Failed to connect to the data store. Attempt {NumberOfAttempts}. '{ExceptionType}'.", attempt, nameof(CosmosOperationCanceledException));
            }
        }
        catch (CosmosException ex) when (ex.IsCmkClientError())
        {
            return HealthCheckResult.Unhealthy(
                "Connection to the data store was unsuccesful because the client's customer-managed key is not available.",
                exception: ex,
                new Dictionary<string, object>() { { "IsCustomerManagedKeyError", true } });
        }
        catch (Exception ex) when (ex.IsRequestRateExceeded())
        {
            // Being throttled means the store answered, so it is reported as Healthy.
            return HealthCheckResult.Healthy("Connection to the data store was successful, however, the rate limit has been exceeded.");
        }
        catch (Exception ex)
        {
            _logger.LogWarning(ex, "Failed to connect to the data store.");

            return HealthCheckResult.Unhealthy("Failed to connect to the data store.");
        }
    }
    while (true);
}
}
}