Skip to content

Commit

Permalink
Merge pull request #104 from jglick/removingAgentIsFatal-JENKINS-49707
Browse files Browse the repository at this point in the history
[JENKINS-49707] (part): Fail the build if an agent has been removed
  • Loading branch information
dwnusbaum authored Jul 2, 2019
2 parents ae58c27 + bd4d976 commit 56d41b7
Show file tree
Hide file tree
Showing 5 changed files with 115 additions and 7 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
import hudson.Util;
import hudson.init.Terminator;
import hudson.model.Node;
import hudson.model.Result;
import hudson.model.TaskListener;
import hudson.remoting.Channel;
import hudson.remoting.ChannelClosedException;
Expand All @@ -58,6 +59,7 @@
import java.util.logging.Logger;
import javax.annotation.CheckForNull;
import javax.annotation.Nonnull;
import jenkins.model.Jenkins;
import jenkins.util.Timer;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringUtils;
Expand All @@ -68,12 +70,15 @@
import org.jenkinsci.plugins.workflow.actions.LabelAction;
import org.jenkinsci.plugins.workflow.graph.FlowNode;
import org.jenkinsci.plugins.workflow.steps.AbstractStepExecutionImpl;
import org.jenkinsci.plugins.workflow.steps.FlowInterruptedException;
import org.jenkinsci.plugins.workflow.steps.Step;
import org.jenkinsci.plugins.workflow.steps.StepContext;
import org.jenkinsci.plugins.workflow.steps.StepDescriptor;
import org.jenkinsci.plugins.workflow.steps.StepExecution;
import org.jenkinsci.plugins.workflow.support.concurrent.Timeout;
import org.jenkinsci.plugins.workflow.support.concurrent.WithThreadName;
import org.jenkinsci.plugins.workflow.support.pickles.ExecutorPickle;
import org.jenkinsci.plugins.workflow.support.steps.ExecutorStepExecution;
import org.kohsuke.accmod.Restricted;
import org.kohsuke.accmod.restrictions.DoNotUse;
import org.kohsuke.accmod.restrictions.NoExternalUse;
Expand Down Expand Up @@ -280,6 +285,8 @@ static final class Execution extends AbstractStepExecutionImpl implements Runnab
private transient boolean awaitingAsynchExit;
/** The first throwable used to stop the task */
private transient volatile Throwable causeOfStoppage;
/** If nonzero, {@link System#nanoTime} when we first discovered that the node had been removed. */
private transient long removedNodeDiscovered;

Execution(StepContext context, DurableTaskStep step) {
super(context);
Expand Down Expand Up @@ -325,13 +332,34 @@ static final class Execution extends AbstractStepExecutionImpl implements Runnab
return false;
}

private @CheckForNull FilePath getWorkspace() throws AbortException {
private @CheckForNull FilePath getWorkspace() throws IOException, InterruptedException {
if (ws == null) {
ws = FilePathUtils.find(node, remote);
if (ws == null) {
// Part of JENKINS-49707: check whether an agent has been removed.
// (Note that a Computer may be missing because a Node is offline,
// and conversely after removing a Node its Computer may remain for a while.
// Therefore we only fail here if _both_ are absent.)
// ExecutorStepExecution.RemovedNodeListener will normally do this first, so this is a fallback.
Jenkins j = Jenkins.getInstanceOrNull();
if (ExecutorStepExecution.RemovedNodeCause.ENABLED && !node.isEmpty() && j != null && j.getNode(node) == null) {
if (removedNodeDiscovered == 0) {
LOGGER.fine(() -> "discovered that " + node + " has been removed");
removedNodeDiscovered = System.nanoTime();
return null;
} else if (System.nanoTime() - removedNodeDiscovered < TimeUnit.MILLISECONDS.toNanos(ExecutorPickle.TIMEOUT_WAITING_FOR_NODE_MILLIS)) {
LOGGER.fine(() -> "rediscovering that " + node + " has been removed");
return null;
} else {
LOGGER.fine(() -> node + " has been removed for a while, assuming it is not coming back");
throw new FlowInterruptedException(Result.ABORTED, new ExecutorStepExecution.RemovedNodeCause());
}
}
removedNodeDiscovered = 0; // something else; reset
LOGGER.log(Level.FINE, "Jenkins is not running, no such node {0}, or it is offline", node);
return null;
}
removedNodeDiscovered = 0;
if (watching) {
try {
controller.watch(ws, new HandlerImpl(this, ws, listener()), listener());
Expand Down Expand Up @@ -476,9 +504,11 @@ private void check() {
final FilePath workspace;
try {
workspace = getWorkspace();
} catch (AbortException x) {
} catch (IOException | InterruptedException x) {
recurrencePeriod = 0;
getContext().onFailure(x);
if (causeOfStoppage == null) { // do not doubly terminate
getContext().onFailure(x);
}
return;
}
if (workspace == null) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,10 @@
package org.jenkinsci.plugins.workflow.support.pickles;

import com.google.common.util.concurrent.ListenableFuture;
import edu.umd.cs.findbugs.annotations.SuppressFBWarnings;
import hudson.AbortException;
import hudson.Extension;
import hudson.Main;
import hudson.model.Executor;
import hudson.model.Node;
import hudson.model.OneOffExecutor;
Expand All @@ -47,6 +49,8 @@
import org.jenkinsci.plugins.workflow.pickles.Pickle;
import org.jenkinsci.plugins.workflow.steps.durable_task.Messages;
import org.jenkinsci.plugins.workflow.support.steps.ExecutorStepExecution;
import org.kohsuke.accmod.Restricted;
import org.kohsuke.accmod.restrictions.NoExternalUse;

/**
* Persists an {@link Executor} as the {@link hudson.model.Queue.Task} it was running.
Expand All @@ -62,7 +66,9 @@ public class ExecutorPickle extends Pickle {

private final boolean isEphemeral;

static long TIMEOUT_WAITING_FOR_NODE_MILLIS = Long.getLong(ExecutorPickle.class.getName()+".timeoutForNodeMillis", TimeUnit.MINUTES.toMillis(5));
@SuppressFBWarnings(value = "MS_SHOULD_BE_FINAL", justification = "deliberately mutable")
@Restricted(NoExternalUse.class)
public static long TIMEOUT_WAITING_FOR_NODE_MILLIS = Main.isUnitTest ? /* fail faster */ TimeUnit.SECONDS.toMillis(15) : Long.getLong(ExecutorPickle.class.getName()+".timeoutForNodeMillis", TimeUnit.MINUTES.toMillis(5));

private ExecutorPickle(Executor executor) {
if (executor instanceof OneOffExecutor) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,10 @@
import java.util.logging.Logger;
import javax.annotation.CheckForNull;
import javax.annotation.Nullable;
import jenkins.model.CauseOfInterruption;
import jenkins.model.Jenkins;
import jenkins.model.Jenkins.MasterComputer;
import jenkins.model.NodeListener;
import jenkins.model.queue.AsynchronousExecution;
import jenkins.security.QueueItemAuthenticator;
import jenkins.security.QueueItemAuthenticatorProvider;
Expand All @@ -68,11 +70,14 @@
import org.jenkinsci.plugins.workflow.graph.FlowNode;
import org.jenkinsci.plugins.workflow.graphanalysis.FlowScanningUtils;
import org.jenkinsci.plugins.workflow.steps.AbstractStepExecutionImpl;
import org.jenkinsci.plugins.workflow.steps.BodyExecution;
import org.jenkinsci.plugins.workflow.steps.BodyExecutionCallback;
import org.jenkinsci.plugins.workflow.steps.StepContext;
import org.jenkinsci.plugins.workflow.steps.durable_task.Messages;
import org.jenkinsci.plugins.workflow.support.actions.WorkspaceActionImpl;
import org.jenkinsci.plugins.workflow.support.concurrent.Timeout;
import org.jenkinsci.plugins.workflow.support.pickles.ExecutorPickle;
import org.jenkinsci.plugins.workflow.support.pickles.WorkspaceListLeasePickle;
import org.kohsuke.accmod.Restricted;
import org.kohsuke.accmod.restrictions.DoNotUse;
import org.kohsuke.accmod.restrictions.NoExternalUse;
Expand Down Expand Up @@ -248,6 +253,49 @@ public void stop(Throwable cause) throws Exception {

}

@Extension public static final class RemovedNodeListener extends NodeListener {
@Override protected void onDeleted(Node node) {
if (!RemovedNodeCause.ENABLED) {
return;
}
LOGGER.fine(() -> "received node deletion event on " + node.getNodeName());
Timer.get().schedule(() -> {
Computer c = node.toComputer();
if (c == null || c.isOnline()) {
LOGGER.fine(() -> "computer for " + node.getNodeName() + " was missing or online, skipping");
return;
}
LOGGER.fine(() -> "processing node deletion event on " + node.getNodeName());
for (Executor e : c.getExecutors()) {
Queue.Executable exec = e.getCurrentExecutable();
if (exec instanceof PlaceholderTask.PlaceholderExecutable) {
PlaceholderTask task = ((PlaceholderTask.PlaceholderExecutable) exec).getParent();
TaskListener listener = TaskListener.NULL;
try {
listener = task.context.get(TaskListener.class);
} catch (Exception x) {
LOGGER.log(Level.WARNING, null, x);
}
if (task.body == null) {
listener.getLogger().println("Agent " + node.getNodeName() + " was deleted, but do not have a node body to cancel");
continue;
}
listener.getLogger().println("Agent " + node.getNodeName() + " was deleted; cancelling node body");
task.body.cancel(new RemovedNodeCause());
}
}
}, ExecutorPickle.TIMEOUT_WAITING_FOR_NODE_MILLIS, TimeUnit.MILLISECONDS);
}
}

public static final class RemovedNodeCause extends CauseOfInterruption {
@SuppressFBWarnings(value = "MS_SHOULD_BE_FINAL", justification = "deliberately mutable")
public static boolean ENABLED = Boolean.parseBoolean(System.getProperty(ExecutorStepExecution.class.getName() + ".REMOVED_NODE_DETECTION", "true"));
@Override public String getShortDescription() {
return "Agent was removed";
}
}

/** Transient handle of a running executor task. */
private static final class RunningTask {
/** null until placeholder executable runs */
Expand Down Expand Up @@ -278,6 +326,17 @@ public static final class PlaceholderTask implements ContinuedTask, Serializable
*/
private String cookie;

/**
* Needed for {@link BodyExecution#cancel}.
* {@code transient} because we cannot save a {@link BodyExecution} in {@link PlaceholderTask}:
* {@link ExecutorPickle} is written to the stream first, which holds a {@link PlaceholderTask},
* and the {@link BodyExecution} holds {@link PlaceholderTask.Callback} whose {@link WorkspaceList.Lease}
* is not processed by {@link WorkspaceListLeasePickle} since pickles are not recursive.
* So we make a best effort and only try to cancel a body within the current session.
* @see RemovedNodeListener
*/
private transient @CheckForNull BodyExecution body;

/** {@link Authentication#getName} of user of build, if known. */
private final @CheckForNull String auth;

Expand Down Expand Up @@ -781,7 +840,7 @@ private final class PlaceholderExecutable implements ContinuableExecutable, Acce
flowNode.addAction(new WorkspaceActionImpl(workspace, flowNode));
}
listener.getLogger().println("Running on " + ModelHyperlinkNote.encodeTo(node) + " in " + workspace);
context.newBodyInvoker()
body = context.newBodyInvoker()
.withContexts(exec, computer, env,
FilePathDynamicContext.createContextualObject(workspace))
.withCallback(new Callback(cookie, lease))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@
import org.apache.commons.lang.StringUtils;

import static org.hamcrest.Matchers.*;
import org.jenkinsci.plugins.durabletask.FileMonitoringTask;

import org.jenkinsci.plugins.workflow.actions.ArgumentsAction;
import org.jenkinsci.plugins.workflow.cps.CpsFlowDefinition;
Expand All @@ -77,6 +78,7 @@
import org.jenkinsci.plugins.workflow.steps.StepContext;
import org.jenkinsci.plugins.workflow.steps.StepDescriptor;
import org.jenkinsci.plugins.workflow.steps.StepExecution;
import org.jenkinsci.plugins.workflow.support.steps.ExecutorStepExecution;
import org.jenkinsci.plugins.workflow.support.visualization.table.FlowGraphTable;
import org.jenkinsci.plugins.workflow.support.visualization.table.FlowGraphTable.Row;
import static org.junit.Assert.*;
Expand Down Expand Up @@ -652,6 +654,19 @@ private static final class HelloNote extends ConsoleNote<Object> {
}
}

@Issue("JENKINS-49707")
@Test public void removingAgentIsFatal() throws Exception {
logging.record(DurableTaskStep.class, Level.FINE).record(FileMonitoringTask.class, Level.FINE).record(ExecutorStepExecution.class, Level.FINE);
DumbSlave s = j.createSlave("remote", null, null);
WorkflowJob p = j.jenkins.createProject(WorkflowJob.class, "p");
p.setDefinition(new CpsFlowDefinition("node('remote') {isUnix() ? sh('sleep 1000000') : bat('ping -t 127.0.0.1 > nul')}", true));
WorkflowRun b = p.scheduleBuild2(0).waitForStart();
j.waitForMessage(Functions.isWindows() ? ">ping" : "+ sleep", b);
j.jenkins.removeNode(s);
j.assertBuildStatus(Result.ABORTED, j.waitForCompletion(b));
j.waitForMessage(new ExecutorStepExecution.RemovedNodeCause().getShortDescription(), b);
}

@Issue("JENKINS-44521")
@Test public void shouldInvokeLauncherDecoratorForShellStep() throws Exception {
DumbSlave slave = j.createSlave("slave", null, null);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -191,15 +191,13 @@ public Node asNode() {
DumbSlave s = r.j.createSlave(Label.get("ghost"));
System.out.println("Agent launched, waiting for semaphore");
SemaphoreStep.waitForStart("wait/1", p.scheduleBuild2(0).waitForStart());
ExecutorPickle.TIMEOUT_WAITING_FOR_NODE_MILLIS = 4000L; // fail faster
r.j.jenkins.removeNode(s);
}
});

r.addStep(new Statement() {
// Start up a build and then reboot and take the node offline
@Override public void evaluate() throws Throwable {
ExecutorPickle.TIMEOUT_WAITING_FOR_NODE_MILLIS = 4000L; // fail faster
assertEquals(0, r.j.jenkins.getLabel("ghost").getNodes().size()); // Make sure test impl is correctly deleted
assertNull(r.j.jenkins.getNode("ghost")); // Make sure test impl is correctly deleted
WorkflowRun run = r.j.jenkins.getItemByFullName("p", WorkflowJob.class).getLastBuild();
Expand Down

0 comments on commit 56d41b7

Please sign in to comment.