Call StepExecution.onResume in parallel to the extent possible
jglick committed May 19, 2022
1 parent 8fba087 commit b1778a9
Showing 1 changed file with 103 additions and 23 deletions.
126 changes (103 additions & 23 deletions): src/main/java/org/jenkinsci/plugins/workflow/flow/FlowExecutionList.java
@@ -23,13 +23,20 @@
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.CancellationException;
import java.util.concurrent.RejectedExecutionException;
import java.util.concurrent.TimeUnit;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.jenkinsci.plugins.workflow.graph.FlowNode;
import org.jenkinsci.plugins.workflow.graphanalysis.LinearBlockHoppingScanner;

import org.kohsuke.accmod.Restricted;
import org.kohsuke.accmod.restrictions.Beta;
@@ -259,29 +266,19 @@ public void onResumed(@NonNull FlowExecution e) {
Futures.addCallback(e.getCurrentExecutions(false), new FutureCallback<List<StepExecution>>() {
@Override
public void onSuccess(@NonNull List<StepExecution> result) {
try {
if (e.isComplete()) {
// WorkflowRun.onLoad will not fireResumed if the execution was already complete when loaded,
// and CpsFlowExecution should not then complete until afterStepExecutionsResumed, so this is defensive.
return;
}
FlowExecutionList list = FlowExecutionList.get();
FlowExecutionOwner owner = e.getOwner();
if (!list.runningTasks.contains(owner)) {
LOGGER.log(Level.WARNING, "Resuming {0}, which is missing from FlowExecutionList ({1}), so registering it now.", new Object[] {owner, list.runningTasks.getView()});
list.register(owner);
}
LOGGER.log(Level.FINE, "Will resume {0}", result);
for (StepExecution se : result) {
try {
se.onResume();
} catch (Throwable x) {
se.getContext().onFailure(x);
}
}
} finally {
e.afterStepExecutionsResumed();
if (e.isComplete()) {
// WorkflowRun.onLoad will not fireResumed if the execution was already complete when loaded,
// and CpsFlowExecution should not then complete until afterStepExecutionsResumed, so this is defensive.
return;
}
FlowExecutionList list = FlowExecutionList.get();
FlowExecutionOwner owner = e.getOwner();
if (!list.runningTasks.contains(owner)) {
LOGGER.log(Level.WARNING, "Resuming {0}, which is missing from FlowExecutionList ({1}), so registering it now.", new Object[] {owner, list.runningTasks.getView()});
list.register(owner);
}
LOGGER.log(Level.FINE, "Will resume {0}", result);
new ParallelResumer(result, e::afterStepExecutionsResumed).run();
}

@Override
@@ -294,7 +291,90 @@ public void onFailure(@NonNull Throwable t) {
e.afterStepExecutionsResumed();
}

}, Timer.get()); // We always hold RunMap and WorkflowRun locks here, so we resume steps on a different thread to avoid potential deadlocks. See JENKINS-67351.
}, MoreExecutors.directExecutor());
}
}

/** Calls {@link StepExecution#onResume} for each step in a running build.
* Does so in parallel, but always completing enclosing blocks before the enclosed step.
* A simplified version of https://stackoverflow.com/a/67449067/12916, since this should be a tree not a general DAG.
*/
private static final class ParallelResumer {

private final Runnable onCompletion;
/** Step nodes mapped to the step execution. Entries removed when they are ready to be resumed. */
private final Map<FlowNode, StepExecution> nodes = new HashMap<>();
/** Step nodes currently being resumed. Removed after resumption completes. */
private final Set<FlowNode> processing = new HashSet<>();
/** Step nodes mapped to the nearest enclosing step node (no entry if at root). */
private final Map<FlowNode, FlowNode> enclosing = new HashMap<>();

ParallelResumer(Collection<StepExecution> executions, Runnable onCompletion) {
this.onCompletion = onCompletion;
// First look up positions in the flow graph, so that we can compute dependencies:
for (StepExecution se : executions) {
try {
FlowNode n = se.getContext().get(FlowNode.class);
if (n != null) {
nodes.put(n, se);
} else {
LOGGER.warning(() -> "Could not find FlowNode for " + se + " so it will not be resumed");
}
} catch (IOException | InterruptedException x) {
LOGGER.log(Level.WARNING, "Could not look up FlowNode for " + se + " so it will not be resumed", x);
}
}
for (FlowNode n : nodes.keySet()) {
LinearBlockHoppingScanner scanner = new LinearBlockHoppingScanner();
scanner.setup(n);
for (FlowNode parent : scanner) {
if (parent != n && nodes.containsKey(parent)) {
enclosing.put(n, parent);
break;
}
}
}
}

synchronized void run() {
LOGGER.fine(() -> "Checking status with nodes=" + nodes + " enclosing=" + enclosing + " processing=" + processing);
if (nodes.isEmpty()) {
if (processing.isEmpty()) {
LOGGER.fine("Done");
onCompletion.run();
}
return;
}
Map<FlowNode, StepExecution> ready = new HashMap<>();
for (Map.Entry<FlowNode, StepExecution> entry : nodes.entrySet()) {
FlowNode n = entry.getKey();
FlowNode parent = enclosing.get(n);
if (parent == null || !nodes.containsKey(parent)) {
ready.put(n, entry.getValue());
}
}
LOGGER.fine(() -> "Ready to resume: " + ready);
nodes.keySet().removeAll(ready.keySet());
for (Map.Entry<FlowNode, StepExecution> entry : ready.entrySet()) {
FlowNode n = entry.getKey();
StepExecution exec = entry.getValue();
processing.add(n);
Timer.get().submit(() -> {
LOGGER.fine(() -> "About to resume " + n + " ~ " + exec);
try {
exec.onResume();
} catch (Throwable x) {
exec.getContext().onFailure(x);
}
LOGGER.fine(() -> "Finished resuming " + n + " ~ " + exec);
synchronized (ParallelResumer.this) {
processing.remove(n);
run();
}
});
}
}

}

}
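For orientation, the scheduling idea behind the new ParallelResumer can be shown as a minimal, self-contained sketch: executions form a tree (a step nested in a block depends on the step of its enclosing block), and each one is submitted to a thread pool only after its enclosing step has finished. The Node and TreeResumer names, the fixed thread pool, the main method, and printStackTrace are illustrative assumptions only; the real class keys on FlowNode, computes enclosing relationships with LinearBlockHoppingScanner, dispatches via Timer.get(), and reports failures through StepContext.onFailure.

import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

public class TreeResumer {

    /** Hypothetical stand-in for a step node paired with its StepExecution. */
    record Node(String name, Runnable onResume) {}

    private final ExecutorService pool = Executors.newFixedThreadPool(4);
    private final Map<Node, Node> enclosing;                // child -> enclosing node; no entry if at root
    private final Set<Node> pending = new HashSet<>();      // not yet submitted
    private final Set<Node> processing = new HashSet<>();   // submitted but not yet finished

    TreeResumer(Map<Node, Node> enclosing, List<Node> all) {
        this.enclosing = new HashMap<>(enclosing);
        pending.addAll(all);
    }

    synchronized void run() {
        if (pending.isEmpty()) {
            if (processing.isEmpty()) {
                pool.shutdown();                             // everything has been resumed
            }
            return;
        }
        List<Node> ready = new ArrayList<>();
        for (Node n : pending) {
            Node parent = enclosing.get(n);
            // A node is ready once its enclosing node is neither waiting nor still running.
            if (parent == null || (!pending.contains(parent) && !processing.contains(parent))) {
                ready.add(n);
            }
        }
        pending.removeAll(ready);
        processing.addAll(ready);
        for (Node n : ready) {
            pool.submit(() -> {
                try {
                    n.onResume().run();                      // StepExecution.onResume in the real code
                } catch (Throwable t) {
                    t.printStackTrace();                     // the real code reports via StepContext.onFailure
                }
                synchronized (TreeResumer.this) {
                    processing.remove(n);
                    run();                                   // re-check which enclosed nodes are now unblocked
                }
            });
        }
    }

    public static void main(String[] args) {
        Node stage = new Node("stage", () -> System.out.println("resume stage"));
        Node sh = new Node("sh", () -> System.out.println("resume sh (only after stage)"));
        new TreeResumer(Map.of(sh, stage), List.of(stage, sh)).run();
    }
}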

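The executor change on the addCallback line also deserves a note. With Guava's Futures.addCallback, the third argument is the executor that runs the callback. The removed line used Timer.get() to hop off the thread holding the RunMap and WorkflowRun locks (JENKINS-67351); after this commit the callback itself only does bookkeeping and ParallelResumer submits the actual onResume calls to Timer.get(), so the callback can run on MoreExecutors.directExecutor(). A hedged sketch of that pattern follows; the SettableFuture, the step names, and the stand-in TIMER pool are illustrative, not the plugin's code.

import com.google.common.util.concurrent.FutureCallback;
import com.google.common.util.concurrent.Futures;
import com.google.common.util.concurrent.MoreExecutors;
import com.google.common.util.concurrent.SettableFuture;
import java.util.List;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;

public class CallbackThreading {

    // Illustrative stand-in for jenkins.util.Timer.get().
    static final ScheduledExecutorService TIMER = Executors.newScheduledThreadPool(2);

    public static void main(String[] args) {
        SettableFuture<List<String>> executions = SettableFuture.create();
        Futures.addCallback(executions, new FutureCallback<List<String>>() {
            @Override public void onSuccess(List<String> result) {
                // With directExecutor this runs on whichever thread completed the future,
                // so keep it cheap and push the real work to the timer pool.
                System.out.println("callback on " + Thread.currentThread().getName());
                for (String step : result) {
                    TIMER.submit(() -> System.out.println("resuming " + step + " on " + Thread.currentThread().getName()));
                }
            }
            @Override public void onFailure(Throwable t) {
                t.printStackTrace();
            }
        }, MoreExecutors.directExecutor());
        executions.set(List.of("sh", "stage"));  // completes the future on the main thread
        TIMER.shutdown();
    }
}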