From 00d589f28325dde29e376bf21e2ad5483c407424 Mon Sep 17 00:00:00 2001 From: Are Almaas Date: Tue, 5 Nov 2024 12:12:43 +0100 Subject: [PATCH 1/4] fix: add timeout for health checks --- .../HealthChecks/RedisHealthCheck.cs | 15 ++++++++++++++ .../HealthChecks/EndpointsHealthCheck.cs | 20 +++++++++++++++++-- 2 files changed, 33 insertions(+), 2 deletions(-) diff --git a/src/Digdir.Domain.Dialogporten.Infrastructure/HealthChecks/RedisHealthCheck.cs b/src/Digdir.Domain.Dialogporten.Infrastructure/HealthChecks/RedisHealthCheck.cs index 71983e8e7..8501ec7c6 100644 --- a/src/Digdir.Domain.Dialogporten.Infrastructure/HealthChecks/RedisHealthCheck.cs +++ b/src/Digdir.Domain.Dialogporten.Infrastructure/HealthChecks/RedisHealthCheck.cs @@ -17,11 +17,26 @@ public async Task CheckHealthAsync(HealthCheckContext context { try { + using var cts = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken); + cts.CancelAfter(TimeSpan.FromSeconds(40)); + + var sw = System.Diagnostics.Stopwatch.StartNew(); using var redis = await ConnectionMultiplexer.ConnectAsync(_settings.Redis.ConnectionString); var db = redis.GetDatabase(); await db.PingAsync(); + sw.Stop(); + + if (sw.Elapsed > TimeSpan.FromSeconds(5)) + { + return HealthCheckResult.Degraded($"Redis connection is slow ({sw.Elapsed.TotalSeconds:N1}s)."); + } + return HealthCheckResult.Healthy("Redis connection is healthy."); } + catch (OperationCanceledException) + { + return HealthCheckResult.Unhealthy("Redis health check timed out after 40s."); + } catch (RedisConnectionException ex) { return HealthCheckResult.Unhealthy("Unable to connect to Redis.", exception: ex); diff --git a/src/Digdir.Library.Utils.AspNet/HealthChecks/EndpointsHealthCheck.cs b/src/Digdir.Library.Utils.AspNet/HealthChecks/EndpointsHealthCheck.cs index 98e7567ee..72bdcefb8 100644 --- a/src/Digdir.Library.Utils.AspNet/HealthChecks/EndpointsHealthCheck.cs +++ b/src/Digdir.Library.Utils.AspNet/HealthChecks/EndpointsHealthCheck.cs @@ -32,13 +32,29 @@ public async Task CheckHealthAsync( { try { - var response = await client.GetAsync(url, cancellationToken); - if (!response.IsSuccessStatusCode) + using var cts = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken); + cts.CancelAfter(TimeSpan.FromSeconds(40)); + + var sw = System.Diagnostics.Stopwatch.StartNew(); + var response = await client.GetAsync(url, cts.Token); + sw.Stop(); + + if (sw.Elapsed > TimeSpan.FromSeconds(5)) + { + _logger.LogWarning("Health check response was slow for endpoint: {Url}. Elapsed time: {Elapsed:N1}s", url, sw.Elapsed.TotalSeconds); + unhealthyEndpoints.Add($"{url} (Degraded - Response time: {sw.Elapsed.TotalSeconds:N1}s)"); + } + else if (!response.IsSuccessStatusCode) { _logger.LogWarning("Health check failed for endpoint: {Url}. Status Code: {StatusCode}", url, response.StatusCode); unhealthyEndpoints.Add($"{url} (Status Code: {response.StatusCode})"); } } + catch (OperationCanceledException) + { + _logger.LogWarning("Health check timed out for endpoint: {Url}", url); + unhealthyEndpoints.Add($"{url} (Timeout after 40s)"); + } catch (Exception ex) { _logger.LogError(ex, "Exception occurred while checking endpoint: {Url}", url); From eef32066b06d0111a72bf67696ba3fb47dee31a7 Mon Sep 17 00:00:00 2001 From: Are Almaas Date: Tue, 5 Nov 2024 12:41:04 +0100 Subject: [PATCH 2/4] fix: add timeout for health checks --- .../HealthChecks/RedisHealthCheck.cs | 17 +++++++++++------ .../HealthChecks/EndpointsHealthCheck.cs | 12 ++++++------ 2 files changed, 17 insertions(+), 12 deletions(-) diff --git a/src/Digdir.Domain.Dialogporten.Infrastructure/HealthChecks/RedisHealthCheck.cs b/src/Digdir.Domain.Dialogporten.Infrastructure/HealthChecks/RedisHealthCheck.cs index 8501ec7c6..32c6c4f36 100644 --- a/src/Digdir.Domain.Dialogporten.Infrastructure/HealthChecks/RedisHealthCheck.cs +++ b/src/Digdir.Domain.Dialogporten.Infrastructure/HealthChecks/RedisHealthCheck.cs @@ -17,11 +17,16 @@ public async Task CheckHealthAsync(HealthCheckContext context { try { - using var cts = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken); - cts.CancelAfter(TimeSpan.FromSeconds(40)); - + var timeout = 15000; var sw = System.Diagnostics.Stopwatch.StartNew(); - using var redis = await ConnectionMultiplexer.ConnectAsync(_settings.Redis.ConnectionString); + + var options = ConfigurationOptions.Parse(_settings.Redis.ConnectionString); + + options.AsyncTimeout = timeout; + options.ConnectTimeout = timeout; + options.SyncTimeout = timeout; + + using var redis = await ConnectionMultiplexer.ConnectAsync(options); var db = redis.GetDatabase(); await db.PingAsync(); sw.Stop(); @@ -33,9 +38,9 @@ public async Task CheckHealthAsync(HealthCheckContext context return HealthCheckResult.Healthy("Redis connection is healthy."); } - catch (OperationCanceledException) + catch (RedisTimeoutException ex) { - return HealthCheckResult.Unhealthy("Redis health check timed out after 40s."); + return HealthCheckResult.Unhealthy("Redis connection timed out.", exception: ex); } catch (RedisConnectionException ex) { diff --git a/src/Digdir.Library.Utils.AspNet/HealthChecks/EndpointsHealthCheck.cs b/src/Digdir.Library.Utils.AspNet/HealthChecks/EndpointsHealthCheck.cs index 72bdcefb8..3a5622b76 100644 --- a/src/Digdir.Library.Utils.AspNet/HealthChecks/EndpointsHealthCheck.cs +++ b/src/Digdir.Library.Utils.AspNet/HealthChecks/EndpointsHealthCheck.cs @@ -39,16 +39,16 @@ public async Task CheckHealthAsync( var response = await client.GetAsync(url, cts.Token); sw.Stop(); - if (sw.Elapsed > TimeSpan.FromSeconds(5)) - { - _logger.LogWarning("Health check response was slow for endpoint: {Url}. Elapsed time: {Elapsed:N1}s", url, sw.Elapsed.TotalSeconds); - unhealthyEndpoints.Add($"{url} (Degraded - Response time: {sw.Elapsed.TotalSeconds:N1}s)"); - } - else if (!response.IsSuccessStatusCode) + if (!response.IsSuccessStatusCode) { _logger.LogWarning("Health check failed for endpoint: {Url}. Status Code: {StatusCode}", url, response.StatusCode); unhealthyEndpoints.Add($"{url} (Status Code: {response.StatusCode})"); } + else if (sw.Elapsed > TimeSpan.FromSeconds(5)) + { + _logger.LogWarning("Health check response was slow for endpoint: {Url}. Elapsed time: {Elapsed:N1}s", url, sw.Elapsed.TotalSeconds); + unhealthyEndpoints.Add($"{url} (Degraded - Response time: {sw.Elapsed.TotalSeconds:N1}s)"); + } } catch (OperationCanceledException) { From a7401a2dd27a112c66dd43dd4dada53eae1dcbd6 Mon Sep 17 00:00:00 2001 From: Are Almaas Date: Tue, 5 Nov 2024 12:47:12 +0100 Subject: [PATCH 3/4] cleanup --- .../HealthChecks/RedisHealthCheck.cs | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/src/Digdir.Domain.Dialogporten.Infrastructure/HealthChecks/RedisHealthCheck.cs b/src/Digdir.Domain.Dialogporten.Infrastructure/HealthChecks/RedisHealthCheck.cs index 32c6c4f36..4a8c39ea0 100644 --- a/src/Digdir.Domain.Dialogporten.Infrastructure/HealthChecks/RedisHealthCheck.cs +++ b/src/Digdir.Domain.Dialogporten.Infrastructure/HealthChecks/RedisHealthCheck.cs @@ -1,24 +1,26 @@ using Microsoft.Extensions.Diagnostics.HealthChecks; using StackExchange.Redis; using Microsoft.Extensions.Options; - +using Microsoft.Extensions.Logging; namespace Digdir.Domain.Dialogporten.Infrastructure.HealthChecks; internal sealed class RedisHealthCheck : IHealthCheck { private readonly InfrastructureSettings _settings; + private readonly ILogger _logger; - public RedisHealthCheck(IOptions options) + public RedisHealthCheck(IOptions options, ILogger logger) { _settings = options?.Value ?? throw new ArgumentNullException(nameof(options)); + _logger = logger ?? throw new ArgumentNullException(nameof(logger)); } public async Task CheckHealthAsync(HealthCheckContext context, CancellationToken cancellationToken = default) { + var sw = System.Diagnostics.Stopwatch.StartNew(); try { var timeout = 15000; - var sw = System.Diagnostics.Stopwatch.StartNew(); var options = ConfigurationOptions.Parse(_settings.Redis.ConnectionString); @@ -29,10 +31,10 @@ public async Task CheckHealthAsync(HealthCheckContext context using var redis = await ConnectionMultiplexer.ConnectAsync(options); var db = redis.GetDatabase(); await db.PingAsync(); - sw.Stop(); if (sw.Elapsed > TimeSpan.FromSeconds(5)) { + _logger.LogWarning("Redis connection is slow ({Elapsed:N1}s).", sw.Elapsed.TotalSeconds); return HealthCheckResult.Degraded($"Redis connection is slow ({sw.Elapsed.TotalSeconds:N1}s)."); } @@ -40,15 +42,22 @@ public async Task CheckHealthAsync(HealthCheckContext context } catch (RedisTimeoutException ex) { + _logger.LogWarning("Redis connection timed out ({Elapsed:N1}s).", sw.Elapsed.TotalSeconds); return HealthCheckResult.Unhealthy("Redis connection timed out.", exception: ex); } catch (RedisConnectionException ex) { + _logger.LogWarning(ex, "Unable to connect to Redis."); return HealthCheckResult.Unhealthy("Unable to connect to Redis.", exception: ex); } catch (Exception ex) { + _logger.LogError(ex, "An unexpected error occurred while checking Redis health."); return HealthCheckResult.Unhealthy("An unexpected error occurred while checking Redis health.", exception: ex); } + finally + { + sw.Stop(); + } } } \ No newline at end of file From e2fbcfad451b18f062636d9f75f4514f5601fa76 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ole=20J=C3=B8rgen=20Skogstad?= Date: Fri, 8 Nov 2024 11:40:20 +0100 Subject: [PATCH 4/4] chore: Healtch check timeout suggestion (#1418) --- .../HealthChecks/RedisHealthCheck.cs | 32 ++++++++++--------- .../HealthChecks/EndpointsHealthCheck.cs | 17 ++++++---- 2 files changed, 27 insertions(+), 22 deletions(-) diff --git a/src/Digdir.Domain.Dialogporten.Infrastructure/HealthChecks/RedisHealthCheck.cs b/src/Digdir.Domain.Dialogporten.Infrastructure/HealthChecks/RedisHealthCheck.cs index 4a8c39ea0..4639c0435 100644 --- a/src/Digdir.Domain.Dialogporten.Infrastructure/HealthChecks/RedisHealthCheck.cs +++ b/src/Digdir.Domain.Dialogporten.Infrastructure/HealthChecks/RedisHealthCheck.cs @@ -1,13 +1,16 @@ +using System.Diagnostics; using Microsoft.Extensions.Diagnostics.HealthChecks; using StackExchange.Redis; using Microsoft.Extensions.Options; using Microsoft.Extensions.Logging; + namespace Digdir.Domain.Dialogporten.Infrastructure.HealthChecks; internal sealed class RedisHealthCheck : IHealthCheck { private readonly InfrastructureSettings _settings; private readonly ILogger _logger; + private const int DegradationThresholdInSeconds = 5; public RedisHealthCheck(IOptions options, ILogger logger) { @@ -17,10 +20,10 @@ public RedisHealthCheck(IOptions options, ILogger CheckHealthAsync(HealthCheckContext context, CancellationToken cancellationToken = default) { - var sw = System.Diagnostics.Stopwatch.StartNew(); + var startTime = Stopwatch.GetTimestamp(); try { - var timeout = 15000; + const int timeout = 15_000; var options = ConfigurationOptions.Parse(_settings.Redis.ConnectionString); @@ -28,22 +31,25 @@ public async Task CheckHealthAsync(HealthCheckContext context options.ConnectTimeout = timeout; options.SyncTimeout = timeout; - using var redis = await ConnectionMultiplexer.ConnectAsync(options); + await using var redis = await ConnectionMultiplexer.ConnectAsync(options); var db = redis.GetDatabase(); await db.PingAsync(); - if (sw.Elapsed > TimeSpan.FromSeconds(5)) + var responseTime = Stopwatch.GetElapsedTime(startTime); + + if (responseTime > TimeSpan.FromSeconds(DegradationThresholdInSeconds)) { - _logger.LogWarning("Redis connection is slow ({Elapsed:N1}s).", sw.Elapsed.TotalSeconds); - return HealthCheckResult.Degraded($"Redis connection is slow ({sw.Elapsed.TotalSeconds:N1}s)."); + _logger.LogWarning("Redis connection is slow ({Elapsed:N1}s).", responseTime.TotalSeconds); + return HealthCheckResult.Degraded($"Redis connection is slow ({responseTime.TotalSeconds:N1}s)."); } return HealthCheckResult.Healthy("Redis connection is healthy."); } catch (RedisTimeoutException ex) { - _logger.LogWarning("Redis connection timed out ({Elapsed:N1}s).", sw.Elapsed.TotalSeconds); - return HealthCheckResult.Unhealthy("Redis connection timed out.", exception: ex); + var responseTime = Stopwatch.GetElapsedTime(startTime); + _logger.LogWarning("Redis connection timed out ({Elapsed:N1}s).", responseTime.TotalSeconds); + return HealthCheckResult.Unhealthy($"Redis connection timed out after {responseTime.TotalSeconds:N1}s.", exception: ex); } catch (RedisConnectionException ex) { @@ -52,12 +58,8 @@ public async Task CheckHealthAsync(HealthCheckContext context } catch (Exception ex) { - _logger.LogError(ex, "An unexpected error occurred while checking Redis health."); - return HealthCheckResult.Unhealthy("An unexpected error occurred while checking Redis health.", exception: ex); - } - finally - { - sw.Stop(); + _logger.LogError(ex, "An unexpected error occurred while checking Redis' health."); + return HealthCheckResult.Unhealthy("An unexpected error occurred while checking Redis' health.", exception: ex); } } -} \ No newline at end of file +} diff --git a/src/Digdir.Library.Utils.AspNet/HealthChecks/EndpointsHealthCheck.cs b/src/Digdir.Library.Utils.AspNet/HealthChecks/EndpointsHealthCheck.cs index 3a5622b76..61d6767ba 100644 --- a/src/Digdir.Library.Utils.AspNet/HealthChecks/EndpointsHealthCheck.cs +++ b/src/Digdir.Library.Utils.AspNet/HealthChecks/EndpointsHealthCheck.cs @@ -2,6 +2,7 @@ using Microsoft.Extensions.Logging; using Microsoft.Extensions.Options; using System.Collections.Concurrent; +using System.Diagnostics; namespace Digdir.Library.Utils.AspNet.HealthChecks; @@ -10,6 +11,8 @@ internal sealed class EndpointsHealthCheck : IHealthCheck private readonly IHttpClientFactory _httpClientFactory; private readonly ILogger _logger; private readonly List _endpoints; + private const int DegradationThresholdInSeconds = 5; + private const int TimeoutInSeconds = 40; public EndpointsHealthCheck( IHttpClientFactory httpClientFactory, @@ -33,27 +36,27 @@ public async Task CheckHealthAsync( try { using var cts = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken); - cts.CancelAfter(TimeSpan.FromSeconds(40)); + cts.CancelAfter(TimeSpan.FromSeconds(TimeoutInSeconds)); - var sw = System.Diagnostics.Stopwatch.StartNew(); + var startTime = Stopwatch.GetTimestamp(); var response = await client.GetAsync(url, cts.Token); - sw.Stop(); + var responseTime = Stopwatch.GetElapsedTime(startTime); if (!response.IsSuccessStatusCode) { _logger.LogWarning("Health check failed for endpoint: {Url}. Status Code: {StatusCode}", url, response.StatusCode); unhealthyEndpoints.Add($"{url} (Status Code: {response.StatusCode})"); } - else if (sw.Elapsed > TimeSpan.FromSeconds(5)) + else if (responseTime > TimeSpan.FromSeconds(DegradationThresholdInSeconds)) { - _logger.LogWarning("Health check response was slow for endpoint: {Url}. Elapsed time: {Elapsed:N1}s", url, sw.Elapsed.TotalSeconds); - unhealthyEndpoints.Add($"{url} (Degraded - Response time: {sw.Elapsed.TotalSeconds:N1}s)"); + _logger.LogWarning("Health check response was slow for endpoint: {Url}. Elapsed time: {Elapsed:N1}s", url, responseTime.TotalSeconds); + unhealthyEndpoints.Add($"{url} (Degraded - Response time: {responseTime.TotalSeconds:N1}s)"); } } catch (OperationCanceledException) { _logger.LogWarning("Health check timed out for endpoint: {Url}", url); - unhealthyEndpoints.Add($"{url} (Timeout after 40s)"); + unhealthyEndpoints.Add($"{url} (Timeout after {TimeoutInSeconds}s)"); } catch (Exception ex) {