Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Terminate the instance when 404 occured. #489

Merged
merged 8 commits into from
Dec 12, 2024
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

package com.google.jenkins.plugins.computeengine;

import com.google.api.client.googleapis.json.GoogleJsonResponseException;
import com.google.api.services.compute.model.AccessConfig;
import com.google.api.services.compute.model.Instance;
import com.google.api.services.compute.model.NetworkInterface;
Expand All @@ -40,6 +41,7 @@
import java.io.PrintStream;
import java.net.InetSocketAddress;
import java.net.Proxy;
import java.net.SocketTimeoutException;
import java.util.Base64;
import java.util.Optional;
import java.util.logging.Level;
Expand Down Expand Up @@ -154,267 +156,291 @@
}
if (opError != null) {
LOGGER.info(String.format(
"Launch failed while waiting for operation %s to complete. Operation error was %s",
"Launch failed while waiting for operation %s to complete. Operation error was %s. Terminating instance.",
insertOperationId, opError.getErrors().get(0).getMessage()));
terminateNode(computer, listener);
return;
}
} catch (InterruptedException e) {
LOGGER.info(String.format(
"Launch failed while waiting for operation %s to complete. Operation error was %s",
"Launch failed while waiting for operation %s to complete. Operation error was %s. Terminating instance",
insertOperationId, opError.getErrors().get(0).getMessage()));
terminateNode(computer, listener);
return;
}

try {
// The operation succeeded. Now wait for the Instance status to be RUNNING
OUTER:
while (true) {
switch (computer.getInstanceStatus()) {
case "PROVISIONING":
case "STAGING":
cloud.log(
LOGGER,
Level.FINEST,
listener,
String.format("Instance %s is being prepared...", computer.getName()));
break;
case "RUNNING":
cloud.log(
LOGGER,
Level.FINEST,
listener,
String.format("Instance %s is running and ready...", computer.getName()));
break OUTER;
case "STOPPING":
case "SUSPENDING":
case "TERMINATED":
cloud.log(
LOGGER,
Level.FINEST,
listener,
String.format("Instance %s is being shut down...", computer.getName()));
break;
// TODO: Although the plugin doesn't put instances in the STOPPED or SUSPENDED states,
// it should handle them if they are placed in that state out-of-band.
case "STOPPED":
case "SUSPENDED":
cloud.log(
LOGGER,
Level.FINEST,
listener,
String.format(
"Instance %s was unexpectedly stopped or suspended...", computer.getName()));
return;
}
Thread.sleep(5000);
}

// Initiate the next launch phase. This is likely an SSH-based process for Linux hosts.
computer.refreshInstance();
launch(computer, listener);
} catch (IOException ioe) {
ioe.printStackTrace(listener.error(ioe.getMessage()));
node = (ComputeEngineInstance) slaveComputer.getNode();
if (node != null) {
try {
node.terminate();
} catch (Exception e) {
listener.error(String.format("Failed to terminate node %s", node.getDisplayName()));
}
}
terminateNode(slaveComputer, listener);
} catch (InterruptedException ie) {

}
}

private static void terminateNode(SlaveComputer slaveComputer, TaskListener listener) {
ComputeEngineInstance node = (ComputeEngineInstance) slaveComputer.getNode();
if (node != null) {
Artmorse marked this conversation as resolved.
Show resolved Hide resolved
try {
node.terminate();
} catch (Exception e) {
listener.error(String.format("Failed to terminate node %s", node.getDisplayName()));
}
}
}

private boolean testCommand(
ComputeEngineComputer computer,
Connection conn,
String checkCommand,
PrintStream logger,
TaskListener listener)
throws IOException, InterruptedException {
logInfo(computer, listener, "Verifying: " + checkCommand);
return conn.exec(checkCommand, logger) == 0;
}

protected abstract Optional<Connection> setupConnection(
ComputeEngineInstance node, ComputeEngineComputer computer, TaskListener listener) throws Exception;

protected abstract String getPathSeparator();

private boolean checkJavaInstalled(
ComputeEngineComputer computer,
Connection conn,
PrintStream logger,
TaskListener listener,
String javaExecPath) {
try {
if (testCommand(computer, conn, String.format("%s -fullversion", javaExecPath), logger, listener)) {
return true;
}
} catch (IOException | InterruptedException ex) {
logException(computer, listener, "Failed to check java: ", ex);
return false;
}

logWarning(computer, listener, String.format("Java is not installed at %s", javaExecPath));
return false;
}

private void copyAgentJar(ComputeEngineComputer computer, Connection conn, TaskListener listener, String jenkinsDir)
throws IOException {
SCPClient scp = conn.createSCPClient();
logInfo(computer, listener, "Copying agent.jar to: " + jenkinsDir);
scp.put(Jenkins.get().getJnlpJars(AGENT_JAR).readFully(), AGENT_JAR, jenkinsDir);
}

private String getJavaLaunchString(String javaExecPath, String jenkinsDir) {
return String.format("%s -jar %s%s%s", javaExecPath, jenkinsDir, getPathSeparator(), AGENT_JAR);
}

private void launch(ComputeEngineComputer computer, TaskListener listener) {
ComputeEngineInstance node = computer.getNode();
if (node == null) {
logWarning(computer, listener, "Could not get node from computer");
return;
}

Connection conn = null;
Optional<Connection> cleanupConn;
PrintStream logger = listener.getLogger();
logInfo(computer, listener, "Launching instance: " + node.getNodeName());
Session sess = null;
try {
cleanupConn = setupConnection(node, computer, listener);
if (!cleanupConn.isPresent()) {
return;
}
conn = cleanupConn.get();
String javaExecPath = node.getJavaExecPathOrDefault();
if (!checkJavaInstalled(computer, conn, logger, listener, javaExecPath)) {
return;
}
String jenkinsDir = node.getRemoteFS();
copyAgentJar(computer, conn, listener, jenkinsDir);
String launchString = getJavaLaunchString(javaExecPath, jenkinsDir);
logInfo(computer, listener, "Launching Jenkins agent via plugin SSH: " + launchString);
sess = conn.openSession();
sess.execCommand(launchString);
Session finalSess = sess;
Connection finalConn = conn;
computer.setChannel(sess.getStdout(), sess.getStdin(), logger, new Channel.Listener() {
@Override
public void onClosed(Channel channel, IOException cause) {
finalSess.close();
finalConn.close();
}
});
} catch (Exception e) {
if (sess != null) {
sess.close();
}
if (conn != null) {
conn.close();
}
logException(computer, listener, "Error: ", e);
}
}

protected Connection connectToSsh(ComputeEngineComputer computer, TaskListener listener) throws Exception {
ComputeEngineInstance node = computer.getNode();
if (node == null) {
throw new IllegalArgumentException("A ComputeEngineComputer with no node was provided");
}

ComputeClient client = node.getCloud().getClient();
final long timeout = node.getLaunchTimeoutMillis();
final long startTime = System.currentTimeMillis();
Connection conn = null;
while (true) {
try {
long waitTime = System.currentTimeMillis() - startTime;
if (timeout > 0 && waitTime > timeout) {
// TODO(google-compute-engine-plugin/issues/135): better exception
throw new Exception("Timed out after "
+ (waitTime / 1000)
+ " seconds of waiting for ssh to become available. (maximum timeout configured is "
+ (timeout / 1000)
+ ")");
}
Instance instance = computer.refreshInstance();
// the instance will be null when the node is terminated
if (instance == null) {
return null;
}

String host = "";

// TODO(google-compute-engine-plugin/issues/136): handle multiple NICs
NetworkInterface nic = instance.getNetworkInterfaces().get(0);

if (this.useInternalAddress) {
host = nic.getNetworkIP();
} else {
// Look for a public IPv4 address
if (nic.getAccessConfigs() != null) {
for (AccessConfig ac : nic.getAccessConfigs()) {
if (ac.getType().equals(NetworkInterfaceIpStackMode.NAT_TYPE)) {
host = ac.getNatIP();
}
}
}
// Look for a public IPv6 address
// TODO: IPv6 address is preferred compared to IPv4, we could let the user select
// his preferences to prioritize them.
if (nic.getIpv6AccessConfigs() != null) {
for (AccessConfig ac : nic.getIpv6AccessConfigs()) {
if (ac.getType().equals(NetworkInterfaceDualStack.IPV6_TYPE)) {
host = ac.getExternalIpv6();
}
}
}
// No public address found. Fall back to internal address
if (host.isEmpty()) {
host = nic.getNetworkIP();
logInfo(computer, listener, "No public address found. Fall back to internal address.");
}
}

int port = SSH_PORT;
logInfo(
computer,
listener,
"Connecting to " + host + " on port " + port + ", with timeout " + SSH_TIMEOUT_MILLIS + ".");
conn = new Connection(host, port);
ProxyConfiguration proxyConfig = Jenkins.get().proxy;
Proxy proxy = proxyConfig == null ? Proxy.NO_PROXY : proxyConfig.createProxy(host);
if (!node.isIgnoreProxy()
&& !proxy.equals(Proxy.NO_PROXY)
&& proxy.address() instanceof InetSocketAddress) {
InetSocketAddress address = (InetSocketAddress) proxy.address();
HTTPProxyData proxyData = null;
if (proxyConfig.getUserName() != null && proxyConfig.getPassword() != null) {
proxyData = new HTTPProxyData(
address.getHostName(),
address.getPort(),
proxyConfig.getUserName(),
proxyConfig.getPassword());
} else {
proxyData = new HTTPProxyData(address.getHostName(), address.getPort());
}
conn.setProxyData(proxyData);
logInfo(computer, listener, "Using HTTP Proxy Configuration");
}

conn.connect(
(hostname, portNum, serverHostKeyAlgorithm, serverHostKey) -> verifyServerHostKey(
client, computer, listener, instance, serverHostKeyAlgorithm, serverHostKey),
SSH_TIMEOUT_MILLIS,
SSH_TIMEOUT_MILLIS);
logInfo(computer, listener, "Connected via SSH.");
return conn;
} catch (IOException e) {
} catch (GoogleJsonResponseException e) {
if (e.getStatusCode() == 404) {
log(
LOGGER,
Level.SEVERE,
listener,
String.format("Instance %s not found. Terminating instance.", computer.getName()));
terminateNode(computer, listener);
}
} catch (SocketTimeoutException e) {
// keep retrying until SSH comes up
logInfo(computer, listener, "Failed to connect via ssh: " + e.getMessage());
logInfo(computer, listener, "Waiting for SSH to come up. Sleeping 5.");
logInfo(computer, listener, String.format("Failed to connect via ssh: %s", e.getMessage()));
logInfo(
computer,
listener,
String.format("Waiting for SSH to come up. Sleeping %d.", SSH_SLEEP_MILLIS / 1000));
Thread.sleep(SSH_SLEEP_MILLIS);
} catch (IOException e) {
Artmorse marked this conversation as resolved.
Show resolved Hide resolved
logWarning(computer, listener, String.format("An error occured: %s", e.getMessage()));

Check warning on line 443 in src/main/java/com/google/jenkins/plugins/computeengine/ComputeEngineComputerLauncher.java

View check run for this annotation

ci.jenkins.io / Code Coverage

Not covered lines

Lines 161-443 are not covered by tests
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,11 @@

package com.google.jenkins.plugins.computeengine;

import static com.google.jenkins.plugins.computeengine.ComputeEngineCloud.CLOUD_ID_LABEL_KEY;

import com.google.cloud.graphite.platforms.plugin.client.ComputeClient.OperationException;
import com.google.common.base.Strings;
import com.google.common.collect.ImmutableMap;
import com.google.jenkins.plugins.computeengine.ssh.GoogleKeyCredential;
import edu.umd.cs.findbugs.annotations.Nullable;
import hudson.Extension;
Expand All @@ -30,6 +33,7 @@
import hudson.slaves.RetentionStrategy;
import java.io.IOException;
import java.util.Collections;
import java.util.Map;
import java.util.Optional;
import java.util.logging.Level;
import java.util.logging.Logger;
Expand Down Expand Up @@ -130,9 +134,16 @@
.createSnapshotSync(cloud.getProjectId(), this.zone, this.getNodeName(), createSnapshotTimeout);
}

// If the instance is running, attempt to terminate it. This is an async call and we
Map<String, String> filterLabel = ImmutableMap.of(CLOUD_ID_LABEL_KEY, cloud.getInstanceId());
var instanceExistsInCloud =
cloud.getClient().listInstancesWithLabel(cloud.getProjectId(), filterLabel).stream()
.anyMatch(instance -> instance.getName().equals(name));

// If the instance exists in the cloud, attempt to terminate it. This is an async call and we
// return immediately, hoping for the best.
cloud.getClient().terminateInstanceAsync(cloud.getProjectId(), zone, name);
if (instanceExistsInCloud) {

Check warning on line 144 in src/main/java/com/google/jenkins/plugins/computeengine/ComputeEngineInstance.java

View check run for this annotation

ci.jenkins.io / Code Coverage

Not covered lines

Lines 137-144 are not covered by tests
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is this needed? Is it just to avoid an unnecessary error message in the logs in case the instance does not exist in the cloud?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This PR is based on another branch (see #470) but I don't think this condition is useful. Theoretically we could even verify the instance exists in the cloud but when running the terminateInstanceAsync the instance could have been removed. I will remove that part.

Please @Theoderich if we're missing cases, please tell us.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

done into f6fc88c

Copy link
Contributor

@Theoderich Theoderich Dec 11, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is part of the fix for #471 and should not be removed.
The Problem:
When Jenkins tries to start a node and the quotas in the target cloud do not allow it, the instance is not started in the cloud but Jenkins thinks that a node was started. Now there is a Zombie node in Jenkins.

When trying to delete this Zombie node, Jenkins calls cloud.getClient().terminateInstanceAsync which throws an Exception since the Instance does not exist in the cloud. The Delete is aborted and the zombie node stays, forever undeletable.

The Solution:
I added a check if the node actually exists in the cloud before calling cloud.getClient().terminateInstanceAsync. If the node does not exist as an instance in the cloud, there is no need to terminate it and the deletion can proceed.

Why not just catch the exception thrown by cloud.getClient().terminateInstanceAsync?
It is not clear from the exception why the terminate operation failed. If the instance exists in the cloud and the termination failed for some other reason, we do not want to delete the node in Jenkins.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok, thanks for the detailed explanation.
I've reverted the changes.

cloud.getClient().terminateInstanceAsync(cloud.getProjectId(), zone, name);
}
} catch (CloudNotFoundException cnfe) {
listener.error(cnfe.getMessage());
} catch (OperationException oe) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,9 @@
logInfo(computer, listener, "Authenticating as " + node.getSshUser());
try {
bootstrapConn = connectToSsh(computer, listener);
if (bootstrapConn == null) {
break;

Check warning on line 89 in src/main/java/com/google/jenkins/plugins/computeengine/ComputeEngineLinuxLauncher.java

View check run for this annotation

ci.jenkins.io / Code Coverage

Not covered lines

Lines 88-89 are not covered by tests
}
isAuthenticated = bootstrapConn.authenticateWithPublicKey(
node.getSshUser(),
Secret.toString(keyCred.getPrivateKey()).toCharArray(),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,9 @@
logInfo(computer, listener, "Authenticating as " + node.getSshUser());
try {
bootstrapConn = connectToSsh(computer, listener);
if (bootstrapConn == null) {
break;

Check warning on line 96 in src/main/java/com/google/jenkins/plugins/computeengine/ComputeEngineWindowsLauncher.java

View check run for this annotation

ci.jenkins.io / Code Coverage

Not covered lines

Lines 95-96 are not covered by tests
}
isAuthenticated = authenticateSSH(node.getSshUser(), windowsConfig, bootstrapConn, listener);
} catch (IOException e) {
logException(computer, listener, "Exception trying to authenticate", e);
Expand Down