Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Restart preempted jobs v2 #113

Merged
merged 14 commits into from
Jul 25, 2019
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,6 @@
import hudson.util.FormValidation;
import hudson.util.HttpResponses;
import hudson.util.ListBoxModel;
import hudson.util.StreamTaskListener;
import java.io.IOException;
import java.io.PrintStream;
import java.util.ArrayList;
Expand Down Expand Up @@ -251,7 +250,7 @@ public Collection<PlannedNode> provision(Label label, int excessWorkload) {
// Get next config in round robin fashion
InstanceConfiguration config = configs.get(i % configs.size());

final ComputeEngineInstance node = config.provision(StreamTaskListener.fromStdout());
final ComputeEngineInstance node = config.provision();
Jenkins.get().addNode(node);
result.add(createPlannedNode(config, node));
excessWorkload -= node.getNumExecutors();
Expand Down Expand Up @@ -408,7 +407,7 @@ public HttpResponse doProvision(@QueryParameter String configuration)
throw HttpResponses.error(SC_BAD_REQUEST, "No such Instance Configuration: " + configuration);
}

ComputeEngineInstance node = c.provision(StreamTaskListener.fromStdout());
ComputeEngineInstance node = c.provision();
if (node == null) throw HttpResponses.error(SC_BAD_REQUEST, "Could not provision new node.");
Jenkins.get().addNode(node);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,33 +17,110 @@
package com.google.jenkins.plugins.computeengine;

import com.google.api.services.compute.model.Instance;
import com.google.api.services.compute.model.Scheduling;
import hudson.model.Executor;
import hudson.model.Result;
import hudson.model.TaskListener;
import hudson.remoting.Channel;
import hudson.slaves.AbstractCloudComputer;
import java.io.IOException;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ExecutionException;
import java.util.logging.Level;
import java.util.logging.Logger;
import jenkins.model.CauseOfInterruption;
import lombok.extern.java.Log;
import org.kohsuke.stapler.DataBoundSetter;
import org.kohsuke.stapler.HttpRedirect;
import org.kohsuke.stapler.HttpResponse;

@Log
public class ComputeEngineComputer extends AbstractCloudComputer<ComputeEngineInstance> {

private volatile Instance instance;

private static final Logger LOGGER = Logger.getLogger(ComputeEngineCloud.class.getName());
private CompletableFuture<Boolean> preemptedFuture;

public ComputeEngineComputer(ComputeEngineInstance slave) {
super(slave);
}

@Override
public ComputeEngineInstance getNode() {
return (ComputeEngineInstance) super.getNode();
}

public void onConnected() {
void onConnected(TaskListener listener) throws IOException {
ComputeEngineInstance node = getNode();
if (node != null) {
node.onConnected();
if (getPreemptible()) {
String nodeName = node.getNodeName();
final String msg =
"Instance " + nodeName + " is preemptive, setting up preemption listener";
log.log(Level.INFO, msg);
listener.getLogger().println(msg);
preemptedFuture =
CompletableFuture.supplyAsync(() -> {
try {
final Boolean value = getChannel().callAsync(new PreemptedCheckCallable(listener)).get();
log.log(Level.INFO, "Got information that node was preempted with value [" + value + "]");
if (value) {
getChannel().close();
}
return value;
} catch (InterruptedException|ExecutionException|IOException e) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When I ran the tests this code from your recent commit was reformatted, so just run mvn compile and commit the result.

throw new RuntimeException(e);
}
}, threadPoolForRemoting);
getChannel()
.addListener(
new Channel.Listener() {
@Override
public void onClosed(Channel channel, IOException cause) {
log.log(Level.FINE, "Got channel close event");
if (getPreempted()) {
log.log(
Level.FINE, "Preempted node channel closed, terminating all executors");
getExecutors().forEach(executor -> interruptExecutor(executor, nodeName));
}
}
});
}
}
}

private void interruptExecutor(Executor executor, String nodeName) {
log.log(Level.INFO, "Terminating executor " + executor + " node " + nodeName);
executor.interrupt(
Result.FAILURE,
new CauseOfInterruption() {
@Override
public String getShortDescription() {
return "Instance " + nodeName + " was preempted";
}
});
}

/**
* Check if instance is preemptible.
*
* @return true if instance was set as preemptible.
*/
public boolean getPreemptible() {
ingwarsw marked this conversation as resolved.
Show resolved Hide resolved
try {
Scheduling scheduling = getInstance().getScheduling();
return scheduling != null && scheduling.getPreemptible();
} catch (IOException e) {
log.log(Level.WARNING, "Error when getting preemptible status", e);
return false;
ingwarsw marked this conversation as resolved.
Show resolved Hide resolved
}
}

/**
* Check if instance was actually preempted.
*
* @return true if instance was preempted (we can use it to reschedule job in this case).
*/
public boolean getPreempted() {
try {
return preemptedFuture != null && preemptedFuture.isDone() && preemptedFuture.get();
} catch (InterruptedException | ExecutionException e) {
log.log(Level.WARNING, "Error when getting preempted status", e);
return false;
ingwarsw marked this conversation as resolved.
Show resolved Hide resolved
}
}

Expand Down Expand Up @@ -119,12 +196,10 @@ public HttpResponse doDoDelete() throws IOException {
ComputeEngineInstance node = getNode();
if (node != null) {
try {
ComputeEngineCloud cloud = getCloud();

node.terminate();
} catch (InterruptedException ie) {
// Termination Exception
LOGGER.log(Level.WARNING, "Node Termination Error", ie);
log.log(Level.WARNING, "Node Termination Error", ie);
}
}
return new HttpRedirect("..");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,15 @@
import hudson.model.Computer;
import hudson.model.TaskListener;
import hudson.slaves.ComputerListener;
import java.io.IOException;

@Extension
public class ComputeEngineComputerListener extends ComputerListener {
@Override
public void onOnline(Computer c, TaskListener listener) {
public void onOnline(Computer c, TaskListener listener) throws IOException {
if (c instanceof ComputeEngineComputer) {
((ComputeEngineComputer) c).onConnected();
ComputeEngineComputer computer = (ComputeEngineComputer) c;
computer.onConnected(listener);
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ protected void _terminate(TaskListener listener) throws IOException, Interrupted

// If the instance is running, attempt to terminate it. This is an asynch call and we
// return immediately, hoping for the best.
cloud.getClient().terminateInstanceWithStatus(cloud.getProjectId(), zone, name, "RUNNING");
cloud.getClient().terminateInstance(cloud.getProjectId(), zone, name);
ingwarsw marked this conversation as resolved.
Show resolved Hide resolved
} catch (CloudNotFoundException cnfe) {
listener.error(cnfe.getMessage());
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
/*
* Copyright 2019 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.google.jenkins.plugins.computeengine;

import com.google.common.collect.ImmutableList;
import hudson.model.Action;
import hudson.model.Cause;
import hudson.model.CauseAction;
import hudson.model.Executor;
import hudson.model.ExecutorListener;
import hudson.model.Job;
import hudson.model.Queue;
import hudson.security.ACL;
import hudson.security.ACLContext;
import hudson.slaves.RetentionStrategy;
import java.util.List;
import java.util.logging.Level;
import jenkins.model.Jenkins;
import lombok.extern.java.Log;
import org.jenkinsci.plugins.durabletask.executors.OnceRetentionStrategy;

/**
* A strategy that allows: - setting one shot instances {@link OnceRetentionStrategy} - in case of
* preemption of GCP instance to restart preempted tasks
*/
@Log
public class ComputeEngineRetentionStrategy extends RetentionStrategy<ComputeEngineComputer>
implements ExecutorListener {
private final OnceRetentionStrategy delegate;
private final boolean oneShot;

/**
* Creates the retention strategy.
*
* @param retentionTimeMinutes Number of minutes of idleness after which to kill the slave; serves
* a backup in case the strategy fails to detect the end of a task.
* @param oneShot Create one shot instance strategy.
*/
ComputeEngineRetentionStrategy(int retentionTimeMinutes, boolean oneShot) {
this.oneShot = oneShot;
delegate = new OnceRetentionStrategy(retentionTimeMinutes);
}

@Override
public long check(ComputeEngineComputer c) {
return delegate.check(c);
ingwarsw marked this conversation as resolved.
Show resolved Hide resolved
}

@Override
public void start(ComputeEngineComputer c) {
delegate.start(c);
}

@Override
public void taskAccepted(Executor executor, Queue.Task task) {
if (oneShot) {
delegate.taskAccepted(executor, task);
}
}

@Override
public void taskCompleted(Executor executor, Queue.Task task, long durationMS) {
if (wasPreempted(executor)) {
rescheduleTask(task);
}
if (oneShot) {
delegate.taskCompleted(executor, task, durationMS);
}
}

@Override
public void taskCompletedWithProblems(
Executor executor, Queue.Task task, long durationMS, Throwable problems) {
if (wasPreempted(executor)) {
rescheduleTask(task);
}
if (oneShot) {
delegate.taskCompletedWithProblems(executor, task, durationMS, problems);
}
}

private Queue.Task getBaseTask(Queue.Task task) {
Queue.Task parent = task.getOwnerTask();
while (task != parent) {
task = parent;
parent = task.getOwnerTask();
}
return parent;
}

private boolean wasPreempted(Executor executor) {
ComputeEngineComputer computer = (ComputeEngineComputer) executor.getOwner();
final boolean preempted = computer.getPreempted();
return preempted;
}

private void rescheduleTask(Queue.Task task) {
Queue.Task baseTask = getBaseTask(task);
log.log(Level.INFO, baseTask + " was preempted, rescheduling");
List<Action> actions = generateActionsForTask(task);
try (ACLContext notUsed = ACL.as(task.getDefaultAuthentication())) {
Jenkins.get().getQueue().schedule2(baseTask, 0, actions);
}
}

private List<Action> generateActionsForTask(Queue.Task task) {
Queue.Task baseTask = getBaseTask(task);
try {
final Job job = (Job) baseTask;
final List causes = job.getLastBuild().getCauses();
log.log(Level.FINE, "Original causes: " + causes);
} catch (Exception e) {
log.log(Level.WARNING, "Exception for " + baseTask, e);
}
return ImmutableList.of(
new CauseAction(new Cause.UserIdCause()), new CauseAction(new RebuildCause()));
}

public static class RebuildCause extends Cause {
@Override
public String getShortDescription() {
return Messages.RebuildCause_ShortDescription();
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@
import java.net.InetSocketAddress;
import java.net.Proxy;
import java.util.Optional;
import java.util.logging.Level;
import java.util.logging.Logger;
import jenkins.model.Jenkins;
import lombok.Getter;
Expand Down
Loading