diff --git a/src/Monitoring.RebootSearchInstance/SearchInstanceRebooter.cs b/src/Monitoring.RebootSearchInstance/SearchInstanceRebooter.cs
index ef61915a0..c64f76cc3 100644
--- a/src/Monitoring.RebootSearchInstance/SearchInstanceRebooter.cs
+++ b/src/Monitoring.RebootSearchInstance/SearchInstanceRebooter.cs
@@ -4,6 +4,7 @@
 using System;
 using System.Diagnostics;
 using System.Linq;
+using System.Net;
 using System.Threading;
 using System.Threading.Tasks;
 using Microsoft.Extensions.Logging;
@@ -220,6 +221,20 @@ private async Task DetermineInstanceHealthAsync(
             {
                 commitDateTime = await _searchServiceClient.GetCommitDateTimeAsync(instance, token);
             }
+            catch (HttpResponseException ex) when (ex.StatusCode == HttpStatusCode.ServiceUnavailable
+                                                || ex.StatusCode == HttpStatusCode.InternalServerError)
+            {
+                _logger.LogInformation(
+                    (EventId)0,
+                    ex,
+                    "The HTTP response when hitting {DiagUrl} was {StatusCode} {ReasonPhrase}. Considering this " +
+                    "instance as an unhealthy state.",
+                    instance.DiagUrl,
+                    (int)ex.StatusCode,
+                    ex.ReasonPhrase);
+
+                return InstanceHealth.Unhealthy;
+            }
             catch (Exception ex)
             {
                 _logger.LogInformation(
diff --git a/src/PackageLagMonitor/HttpResponseException.cs b/src/PackageLagMonitor/HttpResponseException.cs
new file mode 100644
index 000000000..d1a9e9262
--- /dev/null
+++ b/src/PackageLagMonitor/HttpResponseException.cs
@@ -0,0 +1,33 @@
+// Copyright (c) .NET Foundation. All rights reserved.
+// Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information.
+
+using System;
+using System.Net;
+
+namespace NuGet.Jobs.Montoring.PackageLag
+{
+    /// <summary>
+    /// Exception that carries the status code and reason phrase of an HTTP response.
+    /// </summary>
+    public class HttpResponseException : Exception
+    {
+        /// <summary>The HTTP status code supplied when the exception was created.</summary>
+        public HttpStatusCode StatusCode { get; }
+
+        /// <summary>The HTTP reason phrase supplied when the exception was created.</summary>
+        public string ReasonPhrase { get; }
+
+        /// <summary>
+        /// Initializes the exception with the response details and an error message.
+        /// </summary>
+        /// <param name="statusCode">Value exposed through <see cref="StatusCode"/>.</param>
+        /// <param name="reasonPhrase">Value exposed through <see cref="ReasonPhrase"/>.</param>
+        /// <param name="message">Text passed to the base <see cref="Exception"/> constructor.</param>
+        public HttpResponseException(HttpStatusCode statusCode, string reasonPhrase, string message)
+            : base(message)
+        {
+            ReasonPhrase = reasonPhrase;
+            StatusCode = statusCode;
+        }
+    }
+}
diff --git a/src/PackageLagMonitor/Monitoring.PackageLag.csproj b/src/PackageLagMonitor/Monitoring.PackageLag.csproj
index 051bb522e..9b58dc427 100644
--- a/src/PackageLagMonitor/Monitoring.PackageLag.csproj
+++ b/src/PackageLagMonitor/Monitoring.PackageLag.csproj
@@ -48,6 +48,7 @@
+
diff --git a/src/PackageLagMonitor/SearchServiceClient.cs b/src/PackageLagMonitor/SearchServiceClient.cs
index 16b0f5eb4..61cc15a10 100644
--- a/src/PackageLagMonitor/SearchServiceClient.cs
+++ b/src/PackageLagMonitor/SearchServiceClient.cs
@@ -46,6 +46,15 @@ public async Task GetCommitDateTimeAsync(Instance instance, Canc
                     HttpCompletionOption.ResponseContentRead,
                     token))
                 {
+                    if (!diagResponse.IsSuccessStatusCode)
+                    {
+                        throw new HttpResponseException(
+                            diagResponse.StatusCode,
+                            diagResponse.ReasonPhrase,
+                            $"The HTTP response when hitting {instance.DiagUrl} was {(int)diagResponse.StatusCode} " +
+                            $"{diagResponse.ReasonPhrase}, which is not successful.");
+                    }
+
                     var diagContent = diagResponse.Content;
                     var searchDiagResultRaw = await diagContent.ReadAsStringAsync();
                     var searchDiagResultObject = JsonConvert.DeserializeObject(searchDiagResultRaw);
diff --git a/tests/Monitoring.RebootSearchInstance.Tests/SearchInstanceRebooterFacts.cs b/tests/Monitoring.RebootSearchInstance.Tests/SearchInstanceRebooterFacts.cs
index ccef2e6ae..9d882d64c 100644
--- a/tests/Monitoring.RebootSearchInstance.Tests/SearchInstanceRebooterFacts.cs
+++ b/tests/Monitoring.RebootSearchInstance.Tests/SearchInstanceRebooterFacts.cs
@@ -4,6 +4,7 @@
 using System;
 using System.Collections.Generic;
 using System.IO;
+using
System.Net;
 using System.Threading;
 using System.Threading.Tasks;
 using Microsoft.Extensions.Logging;
@@ -169,7 +170,7 @@ public async Task RestartsFirstUnhealthyInstance()
         }
 
         [Fact]
-        public async Task TreatsExceptionWhenGettingCommitTimestampAsUnknown()
+        public async Task TreatsUnknownExceptionWhenGettingCommitTimestampAsUnknown()
         {
             _searchServiceClient
                 .SetupSequence(x => x.GetCommitDateTimeAsync(It.IsAny<Instance>(), It.IsAny<CancellationToken>()))
@@ -195,6 +196,82 @@ public async Task TreatsExceptionWhenGettingCommitTimestampAsUnknown()
             _telemetryService.Verify(x => x.TrackInstanceCount(_region, 3), Times.Once);
         }
 
+        [Theory]
+        [InlineData(HttpStatusCode.BadGateway)]
+        [InlineData(HttpStatusCode.NotFound)]
+        public async Task TreatsUnknownHttpStatusCodeExceptionWhenGettingCommitTimestampAsUnknown(HttpStatusCode statusCode)
+        {
+            // These status codes are expected to leave the first instance in the unknown state, so no reboot
+            // may be issued. The reason phrase is derived from the status code to keep the test data consistent.
+            _searchServiceClient
+                .SetupSequence(x => x.GetCommitDateTimeAsync(It.IsAny<Instance>(), It.IsAny<CancellationToken>()))
+                .ThrowsAsync(new HttpResponseException(statusCode, statusCode.ToString(), "Some problem."))
+                .ReturnsAsync(DateTimeOffset.MaxValue)
+                .ReturnsAsync(DateTimeOffset.MaxValue);
+
+            await _target.RunAsync(_token);
+
+            // NOTE(review): the matcher types below assume six string parameters followed by a
+            // CancellationToken - confirm against the wrapper's signature.
+            _azureManagementAPIWrapper.Verify(
+                x => x.RebootCloudServiceRoleInstanceAsync(
+                    It.IsAny<string>(),
+                    It.IsAny<string>(),
+                    It.IsAny<string>(),
+                    It.IsAny<string>(),
+                    It.IsAny<string>(),
+                    It.IsAny<string>(),
+                    It.IsAny<CancellationToken>()),
+                Times.Never);
+            _telemetryService.Verify(x => x.TrackHealthyInstanceCount(_region, 2), Times.Once);
+            _telemetryService.Verify(x => x.TrackUnhealthyInstanceCount(_region, 0), Times.Once);
+            _telemetryService.Verify(x => x.TrackUnknownInstanceCount(_region, 1), Times.Once);
+            _telemetryService.Verify(x => x.TrackInstanceCount(_region, 3), Times.Once);
+        }
+
+        [Theory]
+        [InlineData(HttpStatusCode.InternalServerError)]
+        [InlineData(HttpStatusCode.ServiceUnavailable)]
+        public async Task TreatsSome500sHttpResponseExceptionAsUnhealthy(HttpStatusCode statusCode)
+        {
+            // The first instance fails with a status code that is expected to mark it unhealthy; the two
+            // remaining instances report a commit timestamp and are expected to be counted as healthy.
+            _searchServiceClient
+                .SetupSequence(x => x.GetCommitDateTimeAsync(It.IsAny<Instance>(), It.IsAny<CancellationToken>()))
+                .ThrowsAsync(new HttpResponseException(statusCode, statusCode.ToString(), "Some problem."))
+                .ReturnsAsync(DateTimeOffset.MaxValue)
+                .ReturnsAsync(DateTimeOffset.MaxValue);
+
+            await _target.RunAsync(_token);
+
+            // The first Verify pins the arguments of the reboot call; the second one rules out any
+            // additional reboot call with different arguments.
+            _azureManagementAPIWrapper.Verify(
+                x => x.RebootCloudServiceRoleInstanceAsync(
+                    _subscription,
+                    _resourceGroup,
+                    _serviceName,
+                    "Production",
+                    _role,
+                    It.IsAny<string>(),
+                    It.IsAny<CancellationToken>()),
+                Times.Once);
+            _azureManagementAPIWrapper.Verify(
+                x => x.RebootCloudServiceRoleInstanceAsync(
+                    It.IsAny<string>(),
+                    It.IsAny<string>(),
+                    It.IsAny<string>(),
+                    It.IsAny<string>(),
+                    It.IsAny<string>(),
+                    It.IsAny<string>(),
+                    It.IsAny<CancellationToken>()),
+                Times.Once);
+            _telemetryService.Verify(x => x.TrackHealthyInstanceCount(_region, 2), Times.Once);
+            _telemetryService.Verify(x => x.TrackUnhealthyInstanceCount(_region, 1), Times.Once);
+            _telemetryService.Verify(x => x.TrackUnknownInstanceCount(_region, 0), Times.Once);
+            _telemetryService.Verify(x => x.TrackInstanceCount(_region, 3), Times.Once);
+        }
+
         [Fact]
         public async Task TreatsLagBetweenThresholdsAsUnknown()
         {