Skip to content

Commit

Permalink
Avoid infinite loops due to corrupted flow graphs in StandardGraphLoo…
Browse files Browse the repository at this point in the history
…kupView and LinearBlockHoppingScanner and improve resumption error handling
  • Loading branch information
dwnusbaum committed Aug 8, 2024
1 parent 27d1ad1 commit 7b934f1
Show file tree
Hide file tree
Showing 21 changed files with 317 additions and 9 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -362,20 +362,31 @@ private static final class ParallelResumer {
nodes.put(n, se);
} else {
LOGGER.warning(() -> "Could not find FlowNode for " + se + " so it will not be resumed");
// TODO: Should we call se.getContext().onFailure()?

Check warning on line 365 in src/main/java/org/jenkinsci/plugins/workflow/flow/FlowExecutionList.java

View check run for this annotation

ci.jenkins.io / Open Tasks Scanner

TODO

NORMAL: Should we call se.getContext().onFailure()?
}
} catch (IOException | InterruptedException x) {
LOGGER.log(Level.WARNING, "Could not look up FlowNode for " + se + " so it will not be resumed", x);
// TODO: Should we call se.getContext().onFailure()?

Check warning on line 369 in src/main/java/org/jenkinsci/plugins/workflow/flow/FlowExecutionList.java

View check run for this annotation

ci.jenkins.io / Open Tasks Scanner

TODO

NORMAL: Should we call se.getContext().onFailure()?
}
}
for (FlowNode n : nodes.keySet()) {
LinearBlockHoppingScanner scanner = new LinearBlockHoppingScanner();
scanner.setup(n);
for (FlowNode parent : scanner) {
if (parent != n && nodes.containsKey(parent)) {
enclosing.put(n, parent);
break;
try {
for (FlowNode n : nodes.keySet()) {
LinearBlockHoppingScanner scanner = new LinearBlockHoppingScanner();
scanner.setup(n);
for (FlowNode parent : scanner) {
if (parent != n && nodes.containsKey(parent)) {
enclosing.put(n, parent);
break;
}
}
}
} catch (Exception e) {
// TODO: Should we instead just try to resume steps with no respect to topological order?

Check warning on line 384 in src/main/java/org/jenkinsci/plugins/workflow/flow/FlowExecutionList.java

View check run for this annotation

ci.jenkins.io / Open Tasks Scanner

TODO

NORMAL: Should we instead just try to resume steps with no respect to topological order?
// There is something wrong with the FlowGraph. Kill all of the steps so the build fails instead of hanging forever.
for (StepExecution se : executions) {
se.getContext().onFailure(e);
}
onCompletion.run();
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,9 @@
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;

/**
Expand Down Expand Up @@ -119,8 +121,11 @@ BlockEndNode bruteForceScanForEnd(@NonNull BlockStartNode start) {
/** Do a brute-force scan for the enclosing blocks **/
BlockStartNode bruteForceScanForEnclosingBlock(@NonNull final FlowNode node) {
FlowNode current = node;

Set<String> visited = new HashSet<>();
while (!(current instanceof FlowStartNode)) { // Hunt back for enclosing blocks, a potentially expensive operation
if (!visited.add(current.getId())) {

Check warning on line 126 in src/main/java/org/jenkinsci/plugins/workflow/graph/StandardGraphLookupView.java

View check run for this annotation

ci.jenkins.io / Code Coverage

Partially covered line

Line 126 is only partially covered, one branch is missing
throw new IllegalStateException("Loop in flow graph for " + node.getExecution() + " involving " + current);

Check warning on line 127 in src/main/java/org/jenkinsci/plugins/workflow/graph/StandardGraphLookupView.java

View check run for this annotation

ci.jenkins.io / Code Coverage

Not covered line

Line 127 is not covered by tests
}
if (current instanceof BlockEndNode) {
// Hop over the block to the start
BlockStartNode start = ((BlockEndNode) current).getStartNode();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,10 @@
import edu.umd.cs.findbugs.annotations.NonNull;
import net.jcip.annotations.NotThreadSafe;
import java.util.Collection;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;

Expand Down Expand Up @@ -85,9 +87,12 @@ protected void setHeads(@NonNull Collection<FlowNode> heads) {
@CheckForNull
protected FlowNode jumpBlockScan(@CheckForNull FlowNode node, @NonNull Collection<FlowNode> blacklistNodes) {
FlowNode candidate = node;

Set<String> visited = new HashSet<>();
// Find the first candidate node preceding a block... and filtering by blacklist
while (candidate instanceof BlockEndNode) {
if (!visited.add(candidate.getId())) {
throw new IllegalStateException("Loop in flow graph for " + candidate.getExecution() + " involving " + candidate);
}
candidate = ((BlockEndNode) candidate).getStartNode();
if (blacklistNodes.contains(candidate)) {
return null;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@
import org.jvnet.hudson.test.LoggerRule;
import org.jvnet.hudson.test.JenkinsSessionRule;
import org.jvnet.hudson.test.TestExtension;
import org.jvnet.hudson.test.recipes.LocalData;
import org.kohsuke.stapler.DataBoundConstructor;

public class FlowExecutionListTest {
Expand Down Expand Up @@ -160,6 +161,27 @@ public class FlowExecutionListTest {
});
}

@LocalData
@Test public void resumeStepExecutionsWithCorruptFlowGraphWithLoop() throws Throwable {
// LocalData created using the following snippet while the build was waiting in the _second_ sleep, except
// for build.xml, which was captured during the sleep step. The StepEndNode for the stage was then adjusted to
// have its startId point to the timeout step's StepStartNode, creating a loop.
/*
sessions.then(r -> {
var stuck = r.createProject(WorkflowJob.class);
stuck.setDefinition(new CpsFlowDefinition("stage('stage') { sleep 30 }; timeout(time: 10) { sleep 30 }", true));
var b = stuck.scheduleBuild2(0).waitForStart();
System.out.println(b.getRootDir());
r.waitForCompletion(b);
});
*/
sessions.then(r -> {
var p = r.jenkins.getItemByFullName("test0", WorkflowJob.class);
var b = p.getBuildByNumber(1);
r.assertBuildStatus(Result.FAILURE, r.waitForCompletion(b));
});
}

public static class NonResumableStep extends Step implements Serializable {
public static final long serialVersionUID = 1L;
@DataBoundConstructor
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
<?xml version='1.1' encoding='UTF-8'?>
<flow-build plugin="[email protected]_ec82f">
<queueId>1</queueId>
<timestamp>1722979974600</timestamp>
<startTime>1722979974615</startTime>
<duration>0</duration>
<charset>UTF-8</charset>
<keepLog>false</keepLog>
<execution class="org.jenkinsci.plugins.workflow.cps.CpsFlowExecution">
<result>SUCCESS</result>
<script>stage(&apos;stage&apos;) { sleep 30 }; timeout(time: 10) { sleep 30 }</script>
<loadedScripts class="linked-hash-map"/>
<durabilityHint>MAX_SURVIVABILITY</durabilityHint>
<timings class="map">
<entry>
<string>flowNode</string>
<long>101709541</long>
</entry>
<entry>
<string>classLoad</string>
<long>124446251</long>
</entry>
<entry>
<string>run</string>
<long>200459289</long>
</entry>
<entry>
<string>parse</string>
<long>166818958</long>
</entry>
<entry>
<string>saveProgram</string>
<long>57936125</long>
</entry>
</timings>
<internalCalls class="sorted-set"/>
<sandbox>true</sandbox>
<iota>5</iota>
<head>1:5</head>
<start>2</start>
<done>false</done>
<resumeBlocked>false</resumeBlocked>
</execution>
<completed>false</completed>
<checkouts class="hudson.util.PersistedList"/>
</flow-build>
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
Started
ha:////4LSn6sjy3SQJvLr4M0Jcw5zYiu9jzzxAhCQhf5ciKGcGAAAAoh+LCAAAAAAAAP9tjTEOwjAQBM8BClpKHuFItIiK1krDC0x8GCfWnbEdkooX8TX+gCESFVvtrLSa5wtWKcKBo5UdUu8otU4GP9jS5Mixv3geZcdn2TIl9igbHBs2eJyx4YwwR1SwULBGaj0nRzbDRnX6rmuvydanHMu2V1A5c4MHCFXMWcf8hSnC9jqYxPTz/BXAFEIGsfuclm8zQVqFvQAAAA==[Pipeline] Start of Pipeline
ha:////4FLPcBhXXN+T+Uy5Bq+9NjiuG45smS/CK+MQ8sUKcTBsAAAApR+LCAAAAAAAAP9tjTEOwjAUQ3+KOrAycohUghExsUZZOEFIQkgb/d8mKe3EibgadyBQiQlLlmxL1nu+oE4RjhQdby12HpP2vA+jK4lPFLtroIm3dOGaMFGwXNpJkrGnpUrKFhaxClYC1hZ1oOTRZdiIVt1VExS65pxj2Q4CKm8GeAAThZxVzN8yR9jeRpMIf5y/AJj7DGxXvP/86jduZBmjwAAAAA==[Pipeline] stage
ha:////4MPOLP4Go7JuW3NkAKhksIfyE+NroTrcISNM8xfRFLQ8AAAApR+LCAAAAAAAAP9tjTEOwjAUQ3+KOrAycoh0gA0xsUZZOEFIQkgb/d8mKe3EibgadyBQiQlLlmxL1nu+oE4RjhQdby12HpP2vA+jK4lPFLtroIm3dOGaMFGwXNpJkrGnpUrKFhaxClYC1hZ1oOTRZdiIVt1VExS65pxj2Q4CKm8GeAAThZxVzN8yR9jeRpMIf5y/AJj7DGxXvP/86jfoP95RwAAAAA==[Pipeline] { (stage)
ha:////4E53KhegWm+s/q0TJkIC5MI9kTq62Eqnzz2Qdi1URRTJAAAAoh+LCAAAAAAAAP9tjTEOAiEURD9rLGwtPQTbaGWsbAmNJ0AWEZb8zwLrbuWJvJp3kLiJlZNMMm+a93rDOic4UbLcG+wdZu14DKOti0+U+lugiXu6ck2YKRguzSSpM+cFJRUDS1gDKwEbgzpQdmgLbIVXD9UGhba9lFS/o4DGdQM8gYlqLiqVL8wJdvexy4Q/z18BzLEA29ce4gdpL1fxvAAAAA==[Pipeline] sleep
Sleeping for 30 sec
ha:////4G8hLCAAqKEvMe92YhTNPJa4MSOZpWK2lhgTDgEHbCXUAAAAoh+LCAAAAAAAAP9tjTEOAiEURD9rLGwtPQTbGBtjZUtoPAGyiLDkfxZYdytP5NW8g8RNrJxkknnTvNcb1jnBiZLl3mDvMGvHYxhtXXyi1N8CTdzTlWvCTMFwaSZJnTkvKKkYWMIaWAnYGNSBskNbYCu8eqg2KLTtpaT6HQU0rhvgCUxUc1GpfGFOsLuPXSb8ef4KYI6xADvU7j8OXFZ7vAAAAA==[Pipeline] }
ha:////4AnScT3OQumBbV+luAyxvhEcCl/8MozDCcq/aC6iNLpjAAAAox+LCAAAAAAAAP9tjTEOAiEURD9rLGwtPQRbWFgYK1tC4wmQRYQl/7PAult5Iq/mHSRuYuUkk8yb5r3esM4JTpQs9wZ7h1k7HsNo6+ITpf4WaOKerlwTZgqGSzNJ6sx5QUnFwBLWwErAxqAOlB3aAlvh1UO1QaFtLyXV7yigcd0AT2CimotK5Qtzgt197DLhz/NXAHOMBdihdv8BHeBS2LwAAAA=[Pipeline] // stage
ha:////4PTj6M4JscP6Gdk49EfTAaLMCwLZYd9IOq+brFvOiJPAAAAAph+LCAAAAAAAAP9tjUEKwjAURH8qXbh16SFScCWIK7chG08QkxjThv/bJLVdeSKv5h2MFlw5MDAzMLznC+oU4UjR8dZi5zFpz/swupL4RLG7Bpp4SxeuCRMFy6WdJBl7WqqkbGERq2AlYG1RB0oeXYaNaNVdNUGha845lu0goPJmgAcwUchZxfwtc4TtbTSJ8Mf5C4C5z8B2xfvPr34DrZTeycAAAAA=[Pipeline] timeout
Timeout set to expire in 10 min
ha:////4M9FYx/jFzgoF1Ji4m6uzCtxEvJBQzBzYoKBBTbKepUTAAAApR+LCAAAAAAAAP9tjTEOwjAUQ3+KOrAycoh0BSEm1igLJwhJCGmj/9skpZ04EVfjDgQqMWHJkm3Jes8X1CnCkaLjrcXOY9Ke92F0JfGJYncNNPGWLlwTJgqWSztJMva0VEnZwiJWwUrA2qIOlDy6DBvRqrtqgkLXnHMs20FA5c0AD2CikLOK+VvmCNvbaBLhj/MXAHOfge2K959f/QbB16AVwAAAAA==[Pipeline] {
ha:////4O/MG/IybiYM4oG30m877qNjUwTyRLwWY87qTVAOsZwoAAAAoh+LCAAAAAAAAP9tjTEOwjAQBDdBFLSUPMKBEiEqWisNLzCJMU6su2BfSCpexNf4AxGRqNhqZ5p5vbFMEUeOTjWWWk+p8qoLvZueGji218CDaviiKqbEwarSDiXX9jRjyWIxL8ux0FhZqgInT06w1o15mCIYcsVZ4uQOGrmv73gi01NZTJQvjBGbW18npl/nbwBjJ8j2gny37T6VOYoyvQAAAA==[Pipeline] sleep
Sleeping for 30 sec
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
1232 5
1252
2157 8
2189
2789 10
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
<?xml version='1.1' encoding='UTF-8'?>
<Tag plugin="[email protected]_ed8a_573">
<node class="cps.n.StepAtomNode" plugin="[email protected]_2c931e7935">
<parentIds>
<string>9</string>
</parentIds>
<id>10</id>
<descriptorId>org.jenkinsci.plugins.workflow.steps.SleepStep</descriptorId>
</node>
<actions>
<cps.a.ArgumentsActionImpl plugin="[email protected]_2c931e7935">
<arguments>
<entry>
<string>time</string>
<long>30</long>
</entry>
</arguments>
<sensitiveVariables/>
<isUnmodifiedBySanitization>true</isUnmodifiedBySanitization>
</cps.a.ArgumentsActionImpl>
<wf.a.TimingAction>
<startTime>1722980005296</startTime>
</wf.a.TimingAction>
<s.a.LogStorageAction/>
</actions>
</Tag>
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
<?xml version='1.1' encoding='UTF-8'?>
<Tag plugin="[email protected]_ed8a_573">
<node class="org.jenkinsci.plugins.workflow.graph.FlowStartNode">
<parentIds/>
<id>2</id>
</node>
<actions>
<wf.a.TimingAction>
<startTime>1722979974868</startTime>
</wf.a.TimingAction>
</actions>
</Tag>
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
<?xml version='1.1' encoding='UTF-8'?>
<Tag plugin="[email protected]_ed8a_573">
<node class="cps.n.StepStartNode" plugin="[email protected]_2c931e7935">
<parentIds>
<string>2</string>
</parentIds>
<id>3</id>
<descriptorId>org.jenkinsci.plugins.workflow.support.steps.StageStep</descriptorId>
</node>
<actions>
<s.a.LogStorageAction/>
<cps.a.ArgumentsActionImpl plugin="[email protected]_2c931e7935">
<arguments>
<entry>
<string>name</string>
<string>stage</string>
</entry>
</arguments>
<sensitiveVariables/>
<isUnmodifiedBySanitization>true</isUnmodifiedBySanitization>
</cps.a.ArgumentsActionImpl>
<wf.a.TimingAction>
<startTime>1722979974972</startTime>
</wf.a.TimingAction>
</actions>
</Tag>
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
<?xml version='1.1' encoding='UTF-8'?>
<Tag plugin="[email protected]_ed8a_573">
<node class="cps.n.StepStartNode" plugin="[email protected]_2c931e7935">
<parentIds>
<string>3</string>
</parentIds>
<id>4</id>
<descriptorId>org.jenkinsci.plugins.workflow.support.steps.StageStep</descriptorId>
</node>
<actions>
<wf.a.BodyInvocationAction/>
<wf.a.LabelAction>
<displayName>stage</displayName>
</wf.a.LabelAction>
<wf.a.TimingAction>
<startTime>1722979974988</startTime>
</wf.a.TimingAction>
</actions>
</Tag>
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
<?xml version='1.1' encoding='UTF-8'?>
<Tag plugin="[email protected]_ed8a_573">
<node class="cps.n.StepAtomNode" plugin="[email protected]_2c931e7935">
<parentIds>
<string>4</string>
</parentIds>
<id>5</id>
<descriptorId>org.jenkinsci.plugins.workflow.steps.SleepStep</descriptorId>
</node>
<actions>
<cps.a.ArgumentsActionImpl plugin="[email protected]_2c931e7935">
<arguments>
<entry>
<string>time</string>
<long>30</long>
</entry>
</arguments>
<sensitiveVariables/>
<isUnmodifiedBySanitization>true</isUnmodifiedBySanitization>
</cps.a.ArgumentsActionImpl>
<wf.a.TimingAction>
<startTime>1722979975077</startTime>
</wf.a.TimingAction>
<s.a.LogStorageAction/>
</actions>
</Tag>
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
<?xml version='1.1' encoding='UTF-8'?>
<Tag plugin="[email protected]_ed8a_573">
<node class="cps.n.StepEndNode" plugin="[email protected]_2c931e7935">
<parentIds>
<string>5</string>
</parentIds>
<id>6</id>
<startId>4</startId>
</node>
<actions>
<wf.a.BodyInvocationAction/>
<wf.a.TimingAction>
<startTime>1722980005117</startTime>
</wf.a.TimingAction>
</actions>
</Tag>
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
<?xml version='1.1' encoding='UTF-8'?>
<Tag plugin="[email protected]_ed8a_573">
<node class="cps.n.StepEndNode" plugin="[email protected]_2c931e7935">
<parentIds>
<string>6</string>
</parentIds>
<id>7</id>
<startId>8</startId> <!-- Manually edited to simulate corruption observed in the real world. -->
</node>
<actions>
<wf.a.TimingAction>
<startTime>1722980005177</startTime>
</wf.a.TimingAction>
</actions>
</Tag>
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
<?xml version='1.1' encoding='UTF-8'?>
<Tag plugin="[email protected]_ed8a_573">
<node class="cps.n.StepStartNode" plugin="[email protected]_2c931e7935">
<parentIds>
<string>7</string>
</parentIds>
<id>8</id>
<descriptorId>org.jenkinsci.plugins.workflow.steps.TimeoutStep</descriptorId>
</node>
<actions>
<s.a.LogStorageAction/>
<cps.a.ArgumentsActionImpl plugin="[email protected]_2c931e7935">
<arguments>
<entry>
<string>time</string>
<int>10</int>
</entry>
</arguments>
<sensitiveVariables/>
<isUnmodifiedBySanitization>true</isUnmodifiedBySanitization>
</cps.a.ArgumentsActionImpl>
<wf.a.TimingAction>
<startTime>1722980005227</startTime>
</wf.a.TimingAction>
</actions>
</Tag>
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
<?xml version='1.1' encoding='UTF-8'?>
<Tag plugin="[email protected]_ed8a_573">
<node class="cps.n.StepStartNode" plugin="[email protected]_2c931e7935">
<parentIds>
<string>8</string>
</parentIds>
<id>9</id>
<descriptorId>org.jenkinsci.plugins.workflow.steps.TimeoutStep</descriptorId>
</node>
<actions>
<wf.a.BodyInvocationAction/>
<wf.a.TimingAction>
<startTime>1722980005244</startTime>
</wf.a.TimingAction>
</actions>
</Tag>
Loading

0 comments on commit 7b934f1

Please sign in to comment.