Skip to content
This repository has been archived by the owner on Jul 30, 2024. It is now read-only.
/ NuGet.Jobs Public archive

Reboot search instances returning 500 and 503 #661

Merged
merged 1 commit into from
Nov 8, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions src/Monitoring.RebootSearchInstance/SearchInstanceRebooter.cs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
using System;
using System.Diagnostics;
using System.Linq;
using System.Net;
using System.Threading;
using System.Threading.Tasks;
using Microsoft.Extensions.Logging;
Expand Down Expand Up @@ -220,6 +221,20 @@ private async Task<InstanceHealth> DetermineInstanceHealthAsync(
{
commitDateTime = await _searchServiceClient.GetCommitDateTimeAsync(instance, token);
}
catch (HttpResponseException ex) when (ex.StatusCode == HttpStatusCode.ServiceUnavailable
|| ex.StatusCode == HttpStatusCode.InternalServerError)
{
_logger.LogInformation(
(EventId)0,
ex,
"The HTTP response when hitting {DiagUrl} was {StatusCode} {ReasonPhrase}. Considering this " +
"instance as an unhealthy state.",
instance.DiagUrl,
(int)ex.StatusCode,
ex.ReasonPhrase);

return InstanceHealth.Unhealthy;
}
catch (Exception ex)
{
_logger.LogInformation(
Expand Down
21 changes: 21 additions & 0 deletions src/PackageLagMonitor/HttpResponseException.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
// Copyright (c) .NET Foundation. All rights reserved.
// Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information.

using System;
using System.Net;

namespace NuGet.Jobs.Montoring.PackageLag
{
    /// <summary>
    /// Exception that carries the status code and reason phrase of an HTTP response
    /// alongside the usual exception message.
    /// </summary>
    public class HttpResponseException : Exception
    {
        /// <summary>
        /// The HTTP status code supplied when the exception was created.
        /// </summary>
        public HttpStatusCode StatusCode { get; }

        /// <summary>
        /// The HTTP reason phrase supplied when the exception was created.
        /// </summary>
        public string ReasonPhrase { get; }

        /// <summary>
        /// Creates the exception from the response details and a descriptive message.
        /// </summary>
        /// <param name="statusCode">Status code exposed via <see cref="StatusCode"/>.</param>
        /// <param name="reasonPhrase">Reason phrase exposed via <see cref="ReasonPhrase"/>.</param>
        /// <param name="message">Message passed to the base <see cref="Exception"/>.</param>
        public HttpResponseException(HttpStatusCode statusCode, string reasonPhrase, string message)
            : base(message)
        {
            this.ReasonPhrase = reasonPhrase;
            this.StatusCode = statusCode;
        }
    }
}
1 change: 1 addition & 0 deletions src/PackageLagMonitor/Monitoring.PackageLag.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@
</PropertyGroup>
<ItemGroup>
<Compile Include="AzureManagementAPIWrapperConfiguration.cs" />
<Compile Include="HttpResponseException.cs" />
<Compile Include="Instance.cs" />
<Compile Include="ISearchServiceClient.cs" />
<Compile Include="Job.cs" />
Expand Down
9 changes: 9 additions & 0 deletions src/PackageLagMonitor/SearchServiceClient.cs
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,15 @@ public async Task<DateTimeOffset> GetCommitDateTimeAsync(Instance instance, Canc
HttpCompletionOption.ResponseContentRead,
token))
{
if (!diagResponse.IsSuccessStatusCode)
{
throw new HttpResponseException(
diagResponse.StatusCode,
diagResponse.ReasonPhrase,
$"The HTTP response when hitting {instance.DiagUrl} was {(int)diagResponse.StatusCode} " +
$"{diagResponse.ReasonPhrase}, which is not successful.");
}

var diagContent = diagResponse.Content;
var searchDiagResultRaw = await diagContent.ReadAsStringAsync();
var searchDiagResultObject = JsonConvert.DeserializeObject<SearchDiagnosticResponse>(searchDiagResultRaw);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
using System;
using System.Collections.Generic;
using System.IO;
using System.Net;
using System.Threading;
using System.Threading.Tasks;
using Microsoft.Extensions.Logging;
Expand Down Expand Up @@ -169,7 +170,7 @@ public async Task RestartsFirstUnhealthyInstance()
}

[Fact]
public async Task TreatsExceptionWhenGettingCommitTimestampAsUnknown()
public async Task TreatsUnknownExceptionWhenGettingCommitTimestampAsUnknown()
{
_searchServiceClient
.SetupSequence(x => x.GetCommitDateTimeAsync(It.IsAny<Instance>(), It.IsAny<CancellationToken>()))
Expand All @@ -195,6 +196,74 @@ public async Task TreatsExceptionWhenGettingCommitTimestampAsUnknown()
_telemetryService.Verify(x => x.TrackInstanceCount(_region, 3), Times.Once);
}

// HTTP failures with status codes other than 500 and 503 must not trigger a reboot;
// the test expects the affected instance to be counted as "unknown".
[Theory]
[InlineData(HttpStatusCode.BadGateway)]
[InlineData(HttpStatusCode.NotFound)]
public async Task TreatsUnknownHttpStatusCodeExceptionWhenGettingCommitTimestampAsUnknown(HttpStatusCode statusCode)
{
    // Arrange: the first commit timestamp request fails with the status code under test,
    // the remaining two succeed. The reason phrase is derived from the status code so the
    // exception does not claim "Service Unavailable" for an unrelated status.
    var failure = new HttpResponseException(statusCode, statusCode.ToString(), "Some problem.");
    _searchServiceClient
        .SetupSequence(x => x.GetCommitDateTimeAsync(It.IsAny<Instance>(), It.IsAny<CancellationToken>()))
        .ThrowsAsync(failure)
        .ReturnsAsync(DateTimeOffset.MaxValue)
        .ReturnsAsync(DateTimeOffset.MaxValue);

    // Act
    await _target.RunAsync(_token);

    // Assert: no reboot was requested for any instance.
    _azureManagementAPIWrapper.Verify(
        x => x.RebootCloudServiceRoleInstanceAsync(
            It.IsAny<string>(),
            It.IsAny<string>(),
            It.IsAny<string>(),
            It.IsAny<string>(),
            It.IsAny<string>(),
            It.IsAny<string>(),
            It.IsAny<CancellationToken>()),
        Times.Never);

    // Assert: two healthy, zero unhealthy, one unknown, three in total.
    _telemetryService.Verify(x => x.TrackHealthyInstanceCount(_region, 2), Times.Once);
    _telemetryService.Verify(x => x.TrackUnhealthyInstanceCount(_region, 0), Times.Once);
    _telemetryService.Verify(x => x.TrackUnknownInstanceCount(_region, 1), Times.Once);
    _telemetryService.Verify(x => x.TrackInstanceCount(_region, 3), Times.Once);
}

// HTTP 500 and 503 responses mark the instance as unhealthy, so the test expects
// exactly one reboot and an unhealthy count of one.
[Theory]
[InlineData(HttpStatusCode.InternalServerError)]
[InlineData(HttpStatusCode.ServiceUnavailable)]
public async Task TreatsSome500sHttpResponseExceptionAsUnhealthy(HttpStatusCode statusCode)
{
    // Arrange: the first commit timestamp request fails with the status code under test,
    // the remaining two succeed. The reason phrase is derived from the status code so it
    // stays consistent with the code for every theory case.
    var failure = new HttpResponseException(statusCode, statusCode.ToString(), "Some problem.");
    _searchServiceClient
        .SetupSequence(x => x.GetCommitDateTimeAsync(It.IsAny<Instance>(), It.IsAny<CancellationToken>()))
        .ThrowsAsync(failure)
        .ReturnsAsync(DateTimeOffset.MaxValue)
        .ReturnsAsync(DateTimeOffset.MaxValue);

    // Act
    await _target.RunAsync(_token);

    // Assert: a reboot was requested once against the expected deployment coordinates...
    _azureManagementAPIWrapper.Verify(
        x => x.RebootCloudServiceRoleInstanceAsync(
            _subscription,
            _resourceGroup,
            _serviceName,
            "Production",
            _role,
            It.IsAny<string>(),
            It.IsAny<CancellationToken>()),
        Times.Once);

    // ...and that was the only reboot requested with any arguments at all.
    _azureManagementAPIWrapper.Verify(
        x => x.RebootCloudServiceRoleInstanceAsync(
            It.IsAny<string>(),
            It.IsAny<string>(),
            It.IsAny<string>(),
            It.IsAny<string>(),
            It.IsAny<string>(),
            It.IsAny<string>(),
            It.IsAny<CancellationToken>()),
        Times.Once);

    // Assert: two healthy, one unhealthy, zero unknown, three in total.
    _telemetryService.Verify(x => x.TrackHealthyInstanceCount(_region, 2), Times.Once);
    _telemetryService.Verify(x => x.TrackUnhealthyInstanceCount(_region, 1), Times.Once);
    _telemetryService.Verify(x => x.TrackUnknownInstanceCount(_region, 0), Times.Once);
    _telemetryService.Verify(x => x.TrackInstanceCount(_region, 3), Times.Once);
}

[Fact]
public async Task TreatsLagBetweenThresholdsAsUnknown()
{
Expand Down