Skip to content

Commit

Permalink
Fix unnecessary decrement of maxTotalUses and add clarity to logs aro…
Browse files Browse the repository at this point in the history
…und terminations triggered by plugin (#375)

* rename EC2TerminationCause to EC2ExecutorInterruptionCause for clarity

* -add and track EC2AgentTerminationReason when plugin triggers termination
-rename force to overrideOtherSettings in scheduleToTerminate for clarity

#363

* [fix] remove unnecessary decrement which could lead to -1 (has special meaning)

[fix] remove misleading log and redundant set

#363

* update and add tests

* rename overrideOtherSettings to ignoreMinConstraints
  • Loading branch information
pdk27 authored Jun 23, 2023
1 parent ba2f4b5 commit 9450fa8
Show file tree
Hide file tree
Showing 15 changed files with 290 additions and 150 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ protected AbstractEC2FleetCloud(String name) {

public abstract boolean hasExcessCapacity();

public abstract boolean scheduleToTerminate(String instanceId, boolean force);
public abstract boolean scheduleToTerminate(String instanceId, boolean ignoreMinConstraints, EC2AgentTerminationReason reason);

public abstract String getOldId();

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
package com.amazon.jenkins.ec2fleet;

/**
* Enum to represent the reason for termination of an EC2 instance by the plugin.
*/
public enum EC2AgentTerminationReason {
IDLE_FOR_TOO_LONG("Agent idle for too long"),
MAX_TOTAL_USES_EXHAUSTED("MaxTotalUses exhausted for agent"),
EXCESS_CAPACITY("Excess capacity for fleet"),
AGENT_DELETED("Agent deleted");

private final String description;

EC2AgentTerminationReason(String description) {
this.description = description;
}

public String getDescription() {
return description;
}

public static EC2AgentTerminationReason fromDescription(String desc) {
for (EC2AgentTerminationReason reason: values()) {
if(reason.description.equalsIgnoreCase(desc)) {
return reason;
}
}
return null;
}

@Override
public String toString() {
return this.description;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,13 @@

import javax.annotation.Nonnull;

public class EC2TerminationCause extends CauseOfInterruption {
public class EC2ExecutorInterruptionCause extends CauseOfInterruption {

@Nonnull
private final String nodeName;

@SuppressWarnings("WeakerAccess")
public EC2TerminationCause(@Nonnull String nodeName) {
public EC2ExecutorInterruptionCause(@Nonnull String nodeName) {
this.nodeName = nodeName;
}

Expand All @@ -22,7 +22,7 @@ public String getShortDescription() {
@Override
public boolean equals(Object o) {
if (o == null || getClass() != o.getClass()) return false;
EC2TerminationCause that = (EC2TerminationCause) o;
EC2ExecutorInterruptionCause that = (EC2ExecutorInterruptionCause) o;
return nodeName.equals(that.nodeName);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ public void afterDisconnect(final SlaveComputer computer, final TaskListener lis
for (Executor executor : executors) {
final Queue.Executable executable = executor.getCurrentExecutable();
if (executable != null) {
executor.interrupt(Result.ABORTED, new EC2TerminationCause(computer.getDisplayName()));
executor.interrupt(Result.ABORTED, new EC2ExecutorInterruptionCause(computer.getDisplayName()));

final SubTask subTask = executable.getParent();
final Queue.Task task = subTask.getOwnerTask();
Expand Down
45 changes: 24 additions & 21 deletions src/main/java/com/amazon/jenkins/ec2fleet/EC2FleetCloud.java
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,7 @@ public class EC2FleetCloud extends AbstractEC2FleetCloud {
/**
* {@link EC2FleetCloud#update()} updating this field, this is one thread
* related to {@link CloudNanny}. At the same time {@link EC2RetentionStrategy}
* call {@link EC2FleetCloud#scheduleToTerminate(String, boolean)} to terminate instance when it is free
* call {@link EC2FleetCloud#scheduleToTerminate(String, boolean, EC2AgentTerminationReason)} to terminate instance when it is free
* and uses this field to know the current capacity.
* <p>
* It could be situation that <code>stats</code> is outdated and plugin will make wrong decision,
Expand All @@ -160,7 +160,7 @@ public class EC2FleetCloud extends AbstractEC2FleetCloud {

private transient int toAdd;

private transient Set<String> instanceIdsToTerminate;
private transient Map<String, EC2AgentTerminationReason> instanceIdsToTerminate;

private transient Set<NodeProvisioner.PlannedNode> plannedNodesCache;

Expand Down Expand Up @@ -363,7 +363,7 @@ synchronized void setPlannedNodeScheduledFutures(final ArrayList<ScheduledFuture
}

// Visible for testing
synchronized Set<String> getInstanceIdsToTerminate() {
synchronized Map<String, EC2AgentTerminationReason> getInstanceIdsToTerminate() {
return instanceIdsToTerminate;
}

Expand Down Expand Up @@ -490,7 +490,7 @@ public FleetStateStats update() {
}
}
final int currentToAdd;
final Set<String> currentInstanceIdsToTerminate;
final Map<String, EC2AgentTerminationReason> currentInstanceIdsToTerminate;
synchronized (this) {
if(minSpareSize > 0) {
// Check spare instances by considering FleetStateStats#getNumDesired so we account for newer instances which are in progress
Expand All @@ -502,14 +502,14 @@ public FleetStateStats update() {
}
}
currentToAdd = toAdd;
currentInstanceIdsToTerminate = new HashSet<>(instanceIdsToTerminate);
currentInstanceIdsToTerminate = new HashMap<>(instanceIdsToTerminate);
}

currentState = updateByState(currentToAdd, currentInstanceIdsToTerminate, currentState);

// lock and update state of plugin, so terminate or provision could work with new state of world
synchronized (this) {
instanceIdsToTerminate.removeAll(currentInstanceIdsToTerminate);
instanceIdsToTerminate.keySet().removeAll(currentInstanceIdsToTerminate.keySet());
// toAdd only grows outside of this method, so we can subtract
toAdd = toAdd - currentToAdd;
fine("setting stats");
Expand Down Expand Up @@ -551,7 +551,7 @@ public boolean removePlannedNodeScheduledFutures(final int numToRemove) {
}

private FleetStateStats updateByState(
final int currentToAdd, final Set<String> currentInstanceIdsToTerminate, final FleetStateStats currentState) {
final int currentToAdd, final Map<String, EC2AgentTerminationReason> currentInstanceIdsToTerminate, final FleetStateStats currentState) {
final Jenkins jenkins = Jenkins.get();
final AmazonEC2 ec2 = Registry.getEc2Api().connect(getAwsCredentialsId(), region, endpoint);

Expand All @@ -577,20 +577,21 @@ private FleetStateStats updateByState(
Queue.withLock(new Runnable() {
@Override
public void run() {
for (final String instanceId : currentInstanceIdsToTerminate) {
info("Removing Jenkins nodes before terminating corresponding EC2 instances");
for (final String instanceId : currentInstanceIdsToTerminate.keySet()) {
final Node node = jenkins.getNode(instanceId);
if (node != null) {
try {
jenkins.removeNode(node);
} catch (IOException e) {
warning("Failed to remove node '%s' from Jenkins before termination", instanceId);
warning("Failed to remove node '%s' from Jenkins before termination.", instanceId);
}
}
}
}
});
info("Terminating nodes: %s", currentInstanceIdsToTerminate);
Registry.getEc2Api().terminateInstances(ec2, currentInstanceIdsToTerminate);
Registry.getEc2Api().terminateInstances(ec2, currentInstanceIdsToTerminate.keySet());
info("Terminated instances: %s", currentInstanceIdsToTerminate);
}

fine("Fleet instances: %s", updatedState.getInstances());
Expand All @@ -600,7 +601,7 @@ public void run() {
final Map<String, Instance> described = Registry.getEc2Api().describeInstances(ec2, fleetInstances);

// Sometimes described includes just deleted instances
described.keySet().removeAll(currentInstanceIdsToTerminate);
described.keySet().removeAll(currentInstanceIdsToTerminate.keySet());
fine("Described instances: %s", described.keySet());

// Fleet takes a while to display terminated instances. Update stats with current view of active instance count
Expand Down Expand Up @@ -693,8 +694,8 @@ public void run() {

/**
* Schedules Jenkins Node and EC2 instance for termination.
* If <code>force</code> is false and target capacity falls below <code>minSize</code> OR <code>minSpareSize</code> thresholds, then reject termination.
* Else if <code>force</code> is true, schedule instance for termination even if it breaches <code>minSize</code> OR <code>minSpareSize</code>
* If <code>ignoreMinConstraints</code> is false and target capacity falls below <code>minSize</code> OR <code>minSpareSize</code> thresholds, then reject termination.
* Else if <code>ignoreMinConstraints</code> is true, schedule instance for termination even if it breaches <code>minSize</code> OR <code>minSpareSize</code>
* <p>
* Real termination will happens in {@link EC2FleetCloud#update()} which is periodically called by
* {@link CloudNanny}. So there could be some lag between the decision to terminate the node
Expand All @@ -705,16 +706,18 @@ public void run() {
* to AWS EC2 API which takes some time and block cloud class.
*
* @param instanceId node name or instance ID
* @param force terminate instance even if it breaches min size constraint
* @param ignoreMinConstraints terminate instance even if it breaches min size constraint
* @param reason reason for termination
* @return <code>true</code> if node scheduled for termination, otherwise <code>false</code>
*/
public synchronized boolean scheduleToTerminate(final String instanceId, final boolean force) {
public synchronized boolean scheduleToTerminate(final String instanceId, final boolean ignoreMinConstraints,
final EC2AgentTerminationReason reason) {
if (stats == null) {
info("First update not done, skipping termination scheduling for '%s'", instanceId);
return false;
}
// We can't remove instances beyond minSize or minSpareSize unless force true
if(!force) {
// We can't remove instances beyond minSize or minSpareSize unless ignoreMinConstraints true
if(!ignoreMinConstraints) {
if (minSize > 0 && stats.getNumActive() - instanceIdsToTerminate.size() <= minSize) {
info("Not scheduling instance '%s' for termination because we need a minimum of %s instance(s) running", instanceId, minSize);
fine("cloud: %s, instanceIdsToTerminate: %s", this, instanceIdsToTerminate);
Expand All @@ -729,8 +732,8 @@ public synchronized boolean scheduleToTerminate(final String instanceId, final b
}
}
}
info("Scheduling instance '%s' for termination on cloud %s with force: %b", instanceId, this, force);
instanceIdsToTerminate.add(instanceId);
info("Scheduling instance '%s' for termination on cloud %s because of reason: %s", instanceId, this, reason);
instanceIdsToTerminate.put(instanceId, reason);
fine("InstanceIdsToTerminate: %s", instanceIdsToTerminate);
return true;
}
Expand Down Expand Up @@ -763,7 +766,7 @@ private void init() {
id = new LazyUuid();

plannedNodesCache = new HashSet<>();
instanceIdsToTerminate = new HashSet<>();
instanceIdsToTerminate = new HashMap<>();
plannedNodeScheduledFutures = new ArrayList<>();
}

Expand Down
33 changes: 18 additions & 15 deletions src/main/java/com/amazon/jenkins/ec2fleet/EC2FleetLabelCloud.java
Original file line number Diff line number Diff line change
Expand Up @@ -354,13 +354,13 @@ private static class State {
int toAdd;
final Set<NodeProvisioner.PlannedNode> plannedNodes;
final Set<NodeProvisioner.PlannedNode> plannedNodesToRemove;
final Set<String> instanceIdsToTerminate;
final Map<String, EC2AgentTerminationReason> instanceIdsToTerminate;

public State(String fleetId) {
this.fleetId = fleetId;
this.plannedNodes = new HashSet<>();
this.plannedNodesToRemove = new HashSet<>();
this.instanceIdsToTerminate = new HashSet<>();
this.instanceIdsToTerminate = new HashMap<>();
}

public State(State state) {
Expand All @@ -370,7 +370,7 @@ public State(State state) {
this.targetCapacity = state.targetCapacity;
this.toAdd = state.toAdd;
this.plannedNodesToRemove = new HashSet<>(state.plannedNodesToRemove);
this.instanceIdsToTerminate = new HashSet<>(state.instanceIdsToTerminate);
this.instanceIdsToTerminate = new HashMap<>(state.instanceIdsToTerminate);
}
}

Expand Down Expand Up @@ -405,7 +405,7 @@ public void update() {
final State state = states.get(entry.getKey());

state.stats = entry.getValue().stats;
state.instanceIdsToTerminate.removeAll(entry.getValue().instanceIdsToTerminate);
state.instanceIdsToTerminate.keySet().removeAll(entry.getValue().instanceIdsToTerminate.keySet());
// toAdd only grow outside of this method, so we can subtract
state.toAdd = state.toAdd - entry.getValue().toAdd;
// remove released planned nodes
Expand Down Expand Up @@ -438,9 +438,9 @@ private void updateByState(final Map<String, State> states) {
}
}

final List<String> instanceIdsToRemove = new ArrayList<>();
final Map<String, EC2AgentTerminationReason> instanceIdsToRemove = new HashMap<>();
for (State state : states.values()) {
instanceIdsToRemove.addAll(state.instanceIdsToTerminate);
instanceIdsToRemove.putAll(state.instanceIdsToTerminate);
}

if (instanceIdsToRemove.size() > 0) {
Expand All @@ -450,21 +450,22 @@ private void updateByState(final Map<String, State> states) {
Queue.withLock(new Runnable() {
@Override
public void run() {
for (final String instanceId : instanceIdsToRemove) {
info("Removing Jenkins nodes before terminating corresponding EC2 instances");
for (final String instanceId : instanceIdsToRemove.keySet()) {
final Node node = jenkins.getNode(instanceId);
if (node != null) {
try {
jenkins.removeNode(node);
} catch (IOException e) {
warning("unable remove node %s from Jenkins, skip, just terminate EC2 instance", instanceId);
warning("unable to remove node %s from Jenkins, skip, just terminate EC2 instance", instanceId);
}
}
}
}
});
info("Delete terminating nodes from Jenkins %s", instanceIdsToRemove);

Registry.getEc2Api().terminateInstances(ec2, instanceIdsToRemove);
Registry.getEc2Api().terminateInstances(ec2, instanceIdsToRemove.keySet());
info("Instances %s were terminated with result", instanceIdsToRemove);
}

Expand Down Expand Up @@ -562,7 +563,9 @@ public void run() {
}
}

public synchronized boolean scheduleToTerminate(final String instanceId, boolean force) {
@Override
public synchronized boolean scheduleToTerminate(final String instanceId, final boolean ignoreMinConstraints,
final EC2AgentTerminationReason terminationReason) {
info("Attempting to terminate instance: %s", instanceId);

final Node node = Jenkins.get().getNode(instanceId);
Expand All @@ -574,19 +577,19 @@ public synchronized boolean scheduleToTerminate(final String instanceId, boolean
return false;
}

// We can't remove instances beyond minSize unless force is true
// We can't remove instances beyond minSize unless ignoreMinConstraints is true
final EC2FleetLabelParameters parameters = new EC2FleetLabelParameters(node.getLabelString());
final int minSize = parameters.getIntOrDefault("minSize", this.minSize);
if (!force && (minSize > 0 && state.stats.getNumDesired() - state.instanceIdsToTerminate.size() <= minSize)) {
if (!ignoreMinConstraints && (minSize > 0 && state.stats.getNumDesired() - state.instanceIdsToTerminate.size() <= minSize)) {
info("Not terminating %s because we need a minimum of %s instances running.", instanceId, minSize);
return false;
}
info("Scheduling instance '%s' for termination on cloud %s with force: %b", instanceId, this, force);
state.instanceIdsToTerminate.add(instanceId);
info("Scheduling instance '%s' for termination on cloud %s because of reason: %s", instanceId, this, terminationReason);
state.instanceIdsToTerminate.put(instanceId, terminationReason);
return true;
}

// sync as we are using modifyable state
// sync as we are using modifiable state
@Override
public synchronized boolean canProvision(final Cloud.CloudState cloudState) {
final Label label = cloudState.getLabel();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ public HttpResponse doDoDelete() throws IOException {
final String instanceId = node.getNodeName();
final AbstractEC2FleetCloud cloud = node.getCloud();
if (cloud != null && StringUtils.isNotBlank(instanceId)) {
cloud.scheduleToTerminate(instanceId, false);
cloud.scheduleToTerminate(instanceId, false, EC2AgentTerminationReason.AGENT_DELETED);
}
}
return super.doDoDelete();
Expand Down
Loading

0 comments on commit 9450fa8

Please sign in to comment.