Skip to content

Commit

Permalink
Merged PR 937777: Adjust logging when commands in plan runner fail
Browse files Browse the repository at this point in the history
Adjust logging when commands in plan runner fail

Related work items: #2506148
  • Loading branch information
avranju committed Jul 3, 2018
1 parent 6a9034d commit 29fd10e
Show file tree
Hide file tree
Showing 2 changed files with 50 additions and 28 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ public Agent(IConfigSource configSource, IEnvironmentProvider environmentProvide

public static async Task<Agent> Create(IConfigSource configSource, IPlanner planner, IPlanRunner planRunner, IReporter reporter,
IModuleIdentityLifecycleManager moduleIdentityLifecycleManager, IEnvironmentProvider environmentProvider,
IEntityStore<string, string> configStore, ISerde<DeploymentConfigInfo> deploymentConfigInfoSerde, IEncryptionProvider encryptionProvider)
IEntityStore<string, string> configStore, ISerde<DeploymentConfigInfo> deploymentConfigInfoSerde, IEncryptionProvider encryptionProvider)
{
Preconditions.CheckNotNull(deploymentConfigInfoSerde, nameof(deploymentConfigInfoSerde));
Preconditions.CheckNotNull(configStore, nameof(configStore));
Expand Down Expand Up @@ -182,7 +182,6 @@ public async Task ReconcileAsync(CancellationToken token)
await this.UpdateCurrentConfig(deploymentConfigInfo);
throw;
}

}
}
else
Expand Down Expand Up @@ -219,7 +218,7 @@ public async Task ReconcileAsync(CancellationToken token)
}
}

// This should be called only within the reconcile lock.
// This should be called only within the reconcile lock.
private async Task UpdateCurrentConfig(DeploymentConfigInfo deploymentConfigInfo)
{
this.environment = this.environmentProvider.Create(deploymentConfigInfo.DeploymentConfig);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ public class OrderedRetryPlanRunner : IPlanRunner
{
readonly AsyncLock sync;
long lastDeploymentId;
Dictionary<string, (int RunCount, DateTime LastRunTimeUtc)> commandRunStatus;
Dictionary<string, CommandRunStats> commandRunStatus;
readonly int maxRunCount;
readonly int coolOffTimeUnitInSeconds;
readonly ISystemTime systemTime;
Expand All @@ -30,7 +30,7 @@ public OrderedRetryPlanRunner(int maxRunCount, int coolOffTimeUnitInSeconds, ISy
this.systemTime = Preconditions.CheckNotNull(systemTime, nameof(systemTime));
this.sync = new AsyncLock();
this.lastDeploymentId = -1;
this.commandRunStatus = new Dictionary<string, (int RunCount, DateTime LastRunTimeUtc)>();
this.commandRunStatus = new Dictionary<string, CommandRunStats>();
}

public async Task<bool> ExecuteAsync(long deploymentId, Plan plan, CancellationToken token)
Expand Down Expand Up @@ -68,6 +68,7 @@ public async Task<bool> ExecuteAsync(long deploymentId, Plan plan, CancellationT
bool skippedModules = false;
foreach (ICommand command in plan.Commands)
{
var (shouldRun, runCount, coolOffPeriod, elapsedTime) = this.ShouldRunCommand(command);
try
{
if (token.IsCancellationRequested)
Expand All @@ -76,7 +77,6 @@ public async Task<bool> ExecuteAsync(long deploymentId, Plan plan, CancellationT
break;
}

var (shouldRun, runCount, coolOffPeriod, elapsedTime) = this.ShouldRunCommand(command);
if (shouldRun)
{
await command.ExecuteAsync(token);
Expand All @@ -85,31 +85,28 @@ public async Task<bool> ExecuteAsync(long deploymentId, Plan plan, CancellationT
// run status
if (this.commandRunStatus.ContainsKey(command.Id))
{
this.commandRunStatus[command.Id] = (0, this.systemTime.UtcNow);
this.commandRunStatus[command.Id] = CommandRunStats.Default;
}
}
else
{
skippedModules = true;
Events.SkippingCommand(deploymentId, command, runCount, this.maxRunCount, coolOffPeriod, elapsedTime);
Events.SkippingCommand(deploymentId, command, this.commandRunStatus[command.Id], this.maxRunCount, coolOffPeriod, elapsedTime);
}
}
catch (Exception ex) when (ex.IsFatal() == false)
{
Events.PlanExecStepFailed(deploymentId, command);
Events.PlanExecStepFailed(deploymentId, command, coolOffPeriod, elapsedTime);
if (!failures.HasValue)
{
failures = Option.Some(new List<Exception>());
}
failures.ForEach(f => f.Add(ex));

// since this command failed, record its status
int runCount = 0;
if (this.commandRunStatus.ContainsKey(command.Id))
{
runCount = this.commandRunStatus[command.Id].RunCount;
}
this.commandRunStatus[command.Id] = (runCount + 1, this.systemTime.UtcNow);
int newRunCount = this.commandRunStatus.ContainsKey(command.Id) ?
this.commandRunStatus[command.Id].RunCount : 0;
this.commandRunStatus[command.Id] = new CommandRunStats(newRunCount + 1, this.systemTime.UtcNow, ex);
}
}

Expand All @@ -133,21 +130,39 @@ TimeSpan elapsedTime
return (true, -1, TimeSpan.MinValue, TimeSpan.MinValue);
}

var (runCount, lastRunTimeUtc) = this.commandRunStatus[command.Id];
CommandRunStats commandRunStatus = this.commandRunStatus[command.Id];

// if this command has been run maxRunCount times already then don't
// run it anymore
if (runCount == this.maxRunCount)
if (commandRunStatus.RunCount == this.maxRunCount)
{
return (false, runCount, TimeSpan.MinValue, TimeSpan.MinValue);
return (false, commandRunStatus.RunCount, TimeSpan.MinValue, TimeSpan.MinValue);
}

TimeSpan coolOffPeriod = TimeSpan.FromSeconds(
this.coolOffTimeUnitInSeconds * Math.Pow(2, runCount)
this.coolOffTimeUnitInSeconds * Math.Pow(2, commandRunStatus.RunCount)
);
TimeSpan elapsedTime = this.systemTime.UtcNow - lastRunTimeUtc;
TimeSpan elapsedTime = this.systemTime.UtcNow - commandRunStatus.LastRunTimeUtc;

return (elapsedTime > coolOffPeriod, runCount, coolOffPeriod, elapsedTime);
return (elapsedTime > coolOffPeriod, commandRunStatus.RunCount, coolOffPeriod, elapsedTime);
}

class CommandRunStats
{
public int RunCount { get; }
public DateTime LastRunTimeUtc { get; }
public Option<Exception> Exception { get; }
public bool LoggedWarning { get; set; }

public static readonly CommandRunStats Default = new CommandRunStats(0, DateTime.MinValue);

public CommandRunStats(int runCount, DateTime lastRunTimeUtc, Exception exception = null)
{
this.RunCount = runCount;
this.LastRunTimeUtc = lastRunTimeUtc;
this.Exception = Option.Maybe(exception);
this.LoggedWarning = false;
}
}

static class Events
Expand Down Expand Up @@ -178,23 +193,31 @@ public static void NewDeployment(long deploymentId) =>
public static void OldDeployment(long deploymentId) =>
Log.LogDebug((int)EventIds.OldDeployment, $"Running plan on existing deployment {deploymentId}");

public static void SkippingCommand(long deploymentId, ICommand command, int runCount, int maxRunCount, TimeSpan coolOffPeriod, TimeSpan elapsedTime)
public static void SkippingCommand(long deploymentId, ICommand command, CommandRunStats runStats, int maxRunCount, TimeSpan coolOffPeriod, TimeSpan elapsedTime)
{
string msg = $"Skipping command \"{command.Show()}\" in deployment {deploymentId}.";
if (runCount == maxRunCount)
runStats.Exception.ForEach(ex => msg += $" Error: {ex.Message}.");

if (runStats.RunCount == maxRunCount)
{
msg += $" Command has been tried {runCount} times. Max run count is {maxRunCount}.";
if (runStats.LoggedWarning == false)
{
msg += $" Command has been tried {runStats.RunCount} times. Giving up (max run count is {maxRunCount}).";
Log.LogWarning((int)EventIds.SkippingCommand, msg);
runStats.LoggedWarning = true;
}
}
else
{
msg += $" Will retry in {(coolOffPeriod - elapsedTime).Humanize()}";
Log.LogDebug((int)EventIds.SkippingCommand, msg);
}

Log.LogInformation((int)EventIds.SkippingCommand, msg);
}

public static void PlanExecStepFailed(long deploymentId, ICommand command) =>
Log.LogError((int)EventIds.PlanExecStepFailed, $"Step failed in deployment {deploymentId}, continuing execution. Failure when running command {command.Show()}");
public static void PlanExecStepFailed(long deploymentId, ICommand command, TimeSpan coolOffPeriod, TimeSpan elapsedTime) =>
Log.LogError(
(int)EventIds.PlanExecStepFailed,
$"Step failed in deployment {deploymentId}, continuing execution. Failure when running command {command.Show()}. Will retry in {(coolOffPeriod - elapsedTime).Humanize()}.");

public static void PlanExecEnded(long deploymentId)
{
Expand Down

0 comments on commit 29fd10e

Please sign in to comment.