Skip to content

Commit

Permalink
Fix 322 and 363 (#376)
Browse files Browse the repository at this point in the history
* [fix] Terminate scheduled instances ONLY IF idle

#363

* [fix] leave maxTotalUses alone and track remainingUses correctly

add a flag to track termination of agents by plugin

* [fix] Fix lost state (instanceIdsToTerminate) on configuration change

[fix] Fix maxtotaluses decrement logic

add logs in post job action to expose tasks terminated with problems

#322

add and fix tests

* add integration tests for configuration change leading to lost state and rebuilding lost state to terminate instances previously marked for termination
  • Loading branch information
pdk27 authored Jun 28, 2023
1 parent 9450fa8 commit 46d6731
Show file tree
Hide file tree
Showing 10 changed files with 512 additions and 197 deletions.
37 changes: 30 additions & 7 deletions src/main/java/com/amazon/jenkins/ec2fleet/EC2FleetCloud.java
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import com.amazonaws.services.ec2.model.Instance;
import com.amazonaws.services.ec2.model.InstanceStateName;
import com.cloudbees.jenkins.plugins.awscredentials.AWSCredentialsHelper;
import com.google.common.collect.Sets;
import hudson.Extension;
import hudson.model.Computer;
import hudson.model.Descriptor;
Expand Down Expand Up @@ -51,6 +52,7 @@
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.logging.SimpleFormatter;
import java.util.stream.Collectors;

/**
* @see CloudNanny
Expand Down Expand Up @@ -212,7 +214,7 @@ public EC2FleetCloud(final String name,
this.minSize = Math.max(0, minSize);
this.maxSize = maxSize;
this.minSpareSize = Math.max(0, minSpareSize);
this.maxTotalUses = StringUtils.isBlank(maxTotalUses) ? -1 : Integer.parseInt(maxTotalUses);
this.maxTotalUses = StringUtils.isBlank(maxTotalUses) ? DEFAULT_MAX_TOTAL_USES : Integer.parseInt(maxTotalUses);
this.numExecutors = Math.max(numExecutors, 1);
this.addNodeOnlyIfRunning = addNodeOnlyIfRunning;
this.restrictUsage = restrictUsage;
Expand Down Expand Up @@ -284,10 +286,6 @@ public String getEndpoint() {
return endpoint;
}

public int getMaxTotalUses() {
return maxTotalUses == null ? DEFAULT_MAX_TOTAL_USES : maxTotalUses;
}

@Override
public String getFleet() {
return fleet;
Expand Down Expand Up @@ -382,6 +380,11 @@ synchronized void setStats(final FleetStateStats stats) {
this.stats = stats;
}

// make maxTotalUses inaccessible from cloud for safety. Use {@link EC2FleetNode#maxTotalUses} and {@link EC2FleetNode#usesRemaining} instead.
public boolean hasUnlimitedUsesForNodes() {
return maxTotalUses == -1;
}

@Override
public synchronized boolean hasExcessCapacity() {
if(stats == null) {
Expand Down Expand Up @@ -502,7 +505,9 @@ public FleetStateStats update() {
}
}
currentToAdd = toAdd;
currentInstanceIdsToTerminate = new HashMap<>(instanceIdsToTerminate);

// for computers currently busy doing work, wait until next update cycle to terminate corresponding instances (issue#363).
currentInstanceIdsToTerminate = filterOutBusyNodes();
}

currentState = updateByState(currentToAdd, currentInstanceIdsToTerminate, currentState);
Expand Down Expand Up @@ -533,6 +538,24 @@ public FleetStateStats update() {
}
}

private Map<String, EC2AgentTerminationReason> filterOutBusyNodes() {
final Jenkins j = Jenkins.get();
final Map<String, EC2AgentTerminationReason> filteredInstanceIdsToTerminate = instanceIdsToTerminate.entrySet()
.stream()
.filter(e -> {
final Computer c = j.getComputer(e.getKey());
return c == null || c.isIdle();
})
.collect(Collectors.toMap(Map.Entry::getKey,Map.Entry::getValue));

final Set<String> filteredOutNonIdleIds = Sets.difference(instanceIdsToTerminate.keySet(), filteredInstanceIdsToTerminate.keySet());
if (filteredOutNonIdleIds.size() > 0) {
info("Skipping termination of the following instances until the next update cycle, as they are still busy doing some work: %s.", filteredOutNonIdleIds);
}

return filteredInstanceIdsToTerminate;
}

public boolean removePlannedNodeScheduledFutures(final int numToRemove) {
if (numToRemove < 1) {
return false;
Expand Down Expand Up @@ -829,7 +852,7 @@ private void addNewSlave(final AmazonEC2 ec2, final Instance instance, FleetStat
final Node.Mode nodeMode = restrictUsage ? Node.Mode.EXCLUSIVE : Node.Mode.NORMAL;
final EC2FleetNode node = new EC2FleetNode(instanceId, "Fleet slave for " + instanceId,
effectiveFsRoot, effectiveNumExecutors, nodeMode, labelString, new ArrayList<NodeProperty<?>>(),
this, computerLauncher, getMaxTotalUses());
this, computerLauncher, maxTotalUses);

// Initialize our retention strategy
node.setRetentionStrategy(new EC2RetentionStrategy());
Expand Down
12 changes: 9 additions & 3 deletions src/main/java/com/amazon/jenkins/ec2fleet/EC2FleetNode.java
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@
public class EC2FleetNode extends Slave implements EphemeralNode, EC2FleetCloudAware {

private volatile AbstractEC2FleetCloud cloud;
private int maxTotalUses;
private final int maxTotalUses;
private int usesRemaining;

public EC2FleetNode(final String name, final String nodeDescription, final String remoteFS, final int numExecutors, final Mode mode, final String label,
final List<? extends NodeProperty<?>> nodeProperties, final AbstractEC2FleetCloud cloud, ComputerLauncher launcher, final int maxTotalUses) throws IOException, Descriptor.FormException {
Expand All @@ -26,6 +27,7 @@ public EC2FleetNode(final String name, final String nodeDescription, final Strin
launcher, RetentionStrategy.NOOP, nodeProperties);
this.cloud = cloud;
this.maxTotalUses = maxTotalUses;
this.usesRemaining = maxTotalUses;
}

@Override
Expand Down Expand Up @@ -61,8 +63,12 @@ public int getMaxTotalUses() {
return this.maxTotalUses;
}

public void setMaxTotalUses(final int maxTotalUses) {
this.maxTotalUses = maxTotalUses;
public int getUsesRemaining() {
return usesRemaining;
}

public void decrementUsesRemaining() {
this.usesRemaining--;
}

@Extension
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,18 @@
public class EC2FleetNodeComputer extends SlaveComputer implements EC2FleetCloudAware {

private final String name;

private volatile AbstractEC2FleetCloud cloud;
private boolean isMarkedForDeletion;

public EC2FleetNodeComputer(final Slave slave, @Nonnull final String name, @Nonnull final AbstractEC2FleetCloud cloud) {
super(slave);
this.name = name;
this.cloud = cloud;
this.isMarkedForDeletion = false;
}

public boolean isMarkedForDeletion() {
return isMarkedForDeletion;
}

@Override
Expand All @@ -44,9 +49,9 @@ public String getDisplayName() {
final String displayName = String.format("%s %s", cloud.getDisplayName(), name);
final EC2FleetNode node = getNode();
if(node != null) {
final int totalUses = node.getMaxTotalUses();
if(totalUses != -1) {
return String.format("%s Builds left: %d ", displayName, totalUses);
final int usesRemaining = node.getUsesRemaining();
if(usesRemaining != -1) {
return String.format("%s Builds left: %d ", displayName, usesRemaining);
}
}
return displayName;
Expand Down Expand Up @@ -82,6 +87,8 @@ public HttpResponse doDoDelete() throws IOException {
final AbstractEC2FleetCloud cloud = node.getCloud();
if (cloud != null && StringUtils.isNotBlank(instanceId)) {
cloud.scheduleToTerminate(instanceId, false, EC2AgentTerminationReason.AGENT_DELETED);
// Persist a flag here as the cloud objects can be re-created on user-initiated changes, hence, losing track of instance ids scheduled to terminate.
this.isMarkedForDeletion = true;
}
}
return super.doDoDelete();
Expand Down
93 changes: 61 additions & 32 deletions src/main/java/com/amazon/jenkins/ec2fleet/EC2RetentionStrategy.java
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
*/
public class EC2RetentionStrategy extends RetentionStrategy<SlaveComputer> implements ExecutorListener {

private static final int RE_CHECK_IN_MINUTE = 1;
private static final int RE_CHECK_IN_A_MINUTE = 1;

private static final Logger LOGGER = Logger.getLogger(EC2RetentionStrategy.class.getName());

Expand All @@ -42,52 +42,62 @@ public long check(final SlaveComputer computer) {
if (cloud == null) {
LOGGER.warning("Cloud is null for computer " + fc.getDisplayName()
+ ". This should be autofixed in a few minutes, if not please create an issue for the plugin");
return RE_CHECK_IN_MINUTE;
return RE_CHECK_IN_A_MINUTE;
}

// Ensure that the EC2FleetCloud cannot be mutated from under us while
// we're doing this check
// Ensure nobody provisions onto this node until we've done
// checking
boolean shouldAcceptTasks = fc.isAcceptingTasks();
boolean justTerminated = false;
boolean markedForTermination = false;
fc.setAcceptingTasks(false);
try {
if(fc.isIdle()) {
final EC2AgentTerminationReason reason;
if (isIdleForTooLong(cloud, fc)) {
reason = EC2AgentTerminationReason.IDLE_FOR_TOO_LONG;
Node node = fc.getNode();
if (node == null) {
return RE_CHECK_IN_A_MINUTE;
}

EC2AgentTerminationReason reason;
// Determine the reason for termination from specific to generic use cases.
// Reasoning for checking all cases of termination initiated by the plugin:
// A user-initiated change to cloud configuration creates a new EC2FleetCloud object, erasing class fields containing data like instance IDs to terminate.
// Hence, determine the reasons for termination here using persisted fields for accurate handling of termination.
if (fc.isMarkedForDeletion()) {
reason = EC2AgentTerminationReason.AGENT_DELETED;
} else if (cloud.hasExcessCapacity()) {
reason = EC2AgentTerminationReason.EXCESS_CAPACITY;
} else if (cloud instanceof EC2FleetCloud && !((EC2FleetCloud) cloud).hasUnlimitedUsesForNodes()
&& ((EC2FleetNode)node).getUsesRemaining() <= 0) {
reason = EC2AgentTerminationReason.MAX_TOTAL_USES_EXHAUSTED;
} else if (isIdleForTooLong(cloud, fc)) {
reason = EC2AgentTerminationReason.IDLE_FOR_TOO_LONG;
} else {
return 0;
}

// Find instance ID
Node compNode = fc.getNode();
if (compNode == null) {
return 0;
return RE_CHECK_IN_A_MINUTE;
}

final String instanceId = compNode.getNodeName();
if (cloud.scheduleToTerminate(instanceId, false, reason)) {
final String instanceId = node.getNodeName();
final boolean ignoreMinConstraints = reason.equals(EC2AgentTerminationReason.MAX_TOTAL_USES_EXHAUSTED);
if (cloud.scheduleToTerminate(instanceId, ignoreMinConstraints, reason)) {
// Instance successfully scheduled for termination, so no longer accept tasks (i.e. suspended)
shouldAcceptTasks = false;
LOGGER.fine(String.format("Suspended node %s after scheduling instance for termination, reason: %s.",
compNode.getDisplayName(), instanceId, reason));
justTerminated = true;
node.getDisplayName(), instanceId, reason));
markedForTermination = true;
}
}

if (cloud.isAlwaysReconnect() && !justTerminated && fc.isOffline() && !fc.isConnecting() && fc.isLaunchSupported()) {
// if connection to the computer is lost for some reason, try to reconnect if configured to do so.
if (cloud.isAlwaysReconnect() && !markedForTermination && fc.isOffline() && !fc.isConnecting() && fc.isLaunchSupported()) {
LOGGER.log(Level.INFO, "Reconnecting to instance: " + fc.getDisplayName());
fc.tryReconnect();
}
} finally {
fc.setAcceptingTasks(shouldAcceptTasks);
}

return RE_CHECK_IN_MINUTE;
return RE_CHECK_IN_A_MINUTE;
}

@Override
Expand Down Expand Up @@ -121,37 +131,56 @@ public void taskAccepted(Executor executor, Queue.Task task) {
final EC2FleetNode ec2FleetNode = computer.getNode();
if (ec2FleetNode != null) {
final int maxTotalUses = ec2FleetNode.getMaxTotalUses();
if (maxTotalUses <= -1) {
LOGGER.fine("maxTotalUses set to unlimited (" + ec2FleetNode.getMaxTotalUses() + ") for agent " + computer.getName());
} else if (maxTotalUses <= 1) {
LOGGER.info("maxTotalUses drained - suspending agent after current build " + computer.getName());
computer.setAcceptingTasks(false);
} else {
ec2FleetNode.setMaxTotalUses(ec2FleetNode.getMaxTotalUses() - 1);
LOGGER.info("Agent " + computer.getName() + " has " + ec2FleetNode.getMaxTotalUses() + " builds left");
if (maxTotalUses <= -1) { // unlimited uses
LOGGER.fine("maxTotalUses set to unlimited (" + maxTotalUses + ") for agent " + computer.getName());
} else { // limited uses
if (ec2FleetNode.getUsesRemaining() > 1) {
ec2FleetNode.decrementUsesRemaining();
LOGGER.info("Agent " + computer.getName() + " has " + ec2FleetNode.getUsesRemaining() + " builds left");
} else if (ec2FleetNode.getUsesRemaining() == 1) { // current task should be the last task for this agent
LOGGER.info(String.format("maxTotalUses drained - suspending agent %s after current build", computer.getName()));
computer.setAcceptingTasks(false);
ec2FleetNode.decrementUsesRemaining();
} else {
// don't decrement when usesRemaining=0, as -1 has a special meaning.
LOGGER.warning(String.format("Agent %s accepted a task after being suspended!!! MaxTotalUses: %d, uses remaining: %d",
computer.getName(), ec2FleetNode.getMaxTotalUses(), ec2FleetNode.getUsesRemaining()));
}
}
}
}
}

@Override
public void taskCompleted(Executor executor, Queue.Task task, long l) {
postJobAction(executor);
postJobAction(executor, null);
}

@Override
public void taskCompletedWithProblems(Executor executor, Queue.Task task, long l, Throwable throwable) {
postJobAction(executor);
postJobAction(executor, throwable);
}

private void postJobAction(Executor executor) {
private void postJobAction(final Executor executor, final Throwable throwable) {
if (throwable != null) {
LOGGER.warning(String.format("Build %s completed with problems on agent %s. TimeSpentInQueue: %ds, duration: %ds, problems: %s",
executor.getCurrentExecutable(), executor.getOwner().getName(),
TimeUnit.MILLISECONDS.toSeconds(executor.getTimeSpentInQueue()),
TimeUnit.MILLISECONDS.toSeconds(executor.getElapsedTime()), throwable.getMessage()));
} else {
LOGGER.info(String.format("Build %s completed successfully on agent %s. TimeSpentInQueue: %ds, duration: %ds.",
executor.getCurrentExecutable(), executor.getOwner().getName(),
TimeUnit.MILLISECONDS.toSeconds(executor.getTimeSpentInQueue()),
TimeUnit.MILLISECONDS.toSeconds(executor.getElapsedTime())));
}

final EC2FleetNodeComputer computer = (EC2FleetNodeComputer) executor.getOwner();
if(computer != null) {
if (computer != null) {
final EC2FleetNode ec2FleetNode = computer.getNode();
if (ec2FleetNode != null) {
final AbstractEC2FleetCloud cloud = ec2FleetNode.getCloud();
if (computer.countBusy() <= 1 && !computer.isAcceptingTasks()) {
LOGGER.info("Calling scheduleToTerminate for node " + ec2FleetNode.getNodeName() + " due to maxTotalUses (" + ec2FleetNode.getMaxTotalUses() + ")");
LOGGER.info("Calling scheduleToTerminate for node " + ec2FleetNode.getNodeName() + " due to exhausted maxTotalUses.");
// Schedule instance for termination even if it breaches minSize and minSpareSize constraints
cloud.scheduleToTerminate(ec2FleetNode.getNodeName(), true, EC2AgentTerminationReason.MAX_TOTAL_USES_EXHAUSTED);
}
Expand Down
Loading

0 comments on commit 46d6731

Please sign in to comment.