From b14ecd1e2cffb4c41c3f95290b929213f9ac518c Mon Sep 17 00:00:00 2001 From: Andreas Janning Date: Fri, 6 Sep 2024 11:18:38 +0200 Subject: [PATCH 1/8] Make sure an instance exists in the cloud before trying to delete it. Else jenkins nodes cannot be deleted if there is no corresponding instance in the cloud. --- .../computeengine/ComputeEngineInstance.java | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/src/main/java/com/google/jenkins/plugins/computeengine/ComputeEngineInstance.java b/src/main/java/com/google/jenkins/plugins/computeengine/ComputeEngineInstance.java index ec0f92a2..6bf1bd8e 100644 --- a/src/main/java/com/google/jenkins/plugins/computeengine/ComputeEngineInstance.java +++ b/src/main/java/com/google/jenkins/plugins/computeengine/ComputeEngineInstance.java @@ -16,8 +16,11 @@ package com.google.jenkins.plugins.computeengine; +import static com.google.jenkins.plugins.computeengine.ComputeEngineCloud.CLOUD_ID_LABEL_KEY; + import com.google.cloud.graphite.platforms.plugin.client.ComputeClient.OperationException; import com.google.common.base.Strings; +import com.google.common.collect.ImmutableMap; import com.google.jenkins.plugins.computeengine.ssh.GoogleKeyCredential; import edu.umd.cs.findbugs.annotations.Nullable; import hudson.Extension; @@ -30,6 +33,7 @@ import hudson.slaves.RetentionStrategy; import java.io.IOException; import java.util.Collections; +import java.util.Map; import java.util.Optional; import java.util.logging.Level; import java.util.logging.Logger; @@ -130,9 +134,16 @@ protected void _terminate(TaskListener listener) throws IOException, Interrupted .createSnapshotSync(cloud.getProjectId(), this.zone, this.getNodeName(), createSnapshotTimeout); } - // If the instance is running, attempt to terminate it. This is an async call and we + Map filterLabel = ImmutableMap.of(CLOUD_ID_LABEL_KEY, cloud.getInstanceId()); + var instanceExistsInCloud = + cloud.getClient().listInstancesWithLabel(cloud.getProjectId(), filterLabel).stream() + .anyMatch(instance -> instance.getName().equals(name)); + + // If the instance exists in the cloud, attempt to terminate it. This is an async call and we // return immediately, hoping for the best. - cloud.getClient().terminateInstanceAsync(cloud.getProjectId(), zone, name); + if (instanceExistsInCloud) { + cloud.getClient().terminateInstanceAsync(cloud.getProjectId(), zone, name); + } } catch (CloudNotFoundException cnfe) { listener.error(cnfe.getMessage()); } catch (OperationException oe) { From 5ba199e91981fb6292f46eb13a87663dc769e9ab Mon Sep 17 00:00:00 2001 From: Andreas Janning Date: Fri, 6 Sep 2024 11:19:27 +0200 Subject: [PATCH 2/8] Terminate instances immediately if they fail to launch. This gets rid of zombie offline nodes in jenkins that failed to start in the cloud. --- .../ComputeEngineComputerLauncher.java | 26 ++++++++++++------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/src/main/java/com/google/jenkins/plugins/computeengine/ComputeEngineComputerLauncher.java b/src/main/java/com/google/jenkins/plugins/computeengine/ComputeEngineComputerLauncher.java index b0b0be1c..3bd94c50 100644 --- a/src/main/java/com/google/jenkins/plugins/computeengine/ComputeEngineComputerLauncher.java +++ b/src/main/java/com/google/jenkins/plugins/computeengine/ComputeEngineComputerLauncher.java @@ -154,14 +154,16 @@ public void launch(SlaveComputer slaveComputer, TaskListener listener) { } if (opError != null) { LOGGER.info(String.format( - "Launch failed while waiting for operation %s to complete. Operation error was %s", + "Launch failed while waiting for operation %s to complete. Operation error was %s. Terminating instance.", insertOperationId, opError.getErrors().get(0).getMessage())); + terminateNode(computer, listener); return; } } catch (InterruptedException e) { LOGGER.info(String.format( - "Launch failed while waiting for operation %s to complete. Operation error was %s", + "Launch failed while waiting for operation %s to complete. Operation error was %s. Terminating instance", insertOperationId, opError.getErrors().get(0).getMessage())); + terminateNode(computer, listener); return; } @@ -214,19 +216,23 @@ public void launch(SlaveComputer slaveComputer, TaskListener listener) { launch(computer, listener); } catch (IOException ioe) { ioe.printStackTrace(listener.error(ioe.getMessage())); - node = (ComputeEngineInstance) slaveComputer.getNode(); - if (node != null) { - try { - node.terminate(); - } catch (Exception e) { - listener.error(String.format("Failed to terminate node %s", node.getDisplayName())); - } - } + terminateNode(slaveComputer, listener); } catch (InterruptedException ie) { } } + private static void terminateNode(SlaveComputer slaveComputer, TaskListener listener) { + ComputeEngineInstance node = (ComputeEngineInstance) slaveComputer.getNode(); + if (node != null) { + try { + node.terminate(); + } catch (Exception e) { + listener.error(String.format("Failed to terminate node %s", node.getDisplayName())); + } + } + } + private boolean testCommand( ComputeEngineComputer computer, Connection conn, From ec6f4b3b126e47da723a94aca14c975a36a68eff Mon Sep 17 00:00:00 2001 From: Arthur Caron Date: Thu, 28 Nov 2024 15:19:11 +0100 Subject: [PATCH 3/8] Terminate instance when not found. --- .../ComputeEngineComputerLauncher.java | 26 ++++++++++++++++--- .../ComputeEngineLinuxLauncher.java | 3 +++ .../ComputeEngineWindowsLauncher.java | 3 +++ 3 files changed, 29 insertions(+), 3 deletions(-) diff --git a/src/main/java/com/google/jenkins/plugins/computeengine/ComputeEngineComputerLauncher.java b/src/main/java/com/google/jenkins/plugins/computeengine/ComputeEngineComputerLauncher.java index 3bd94c50..aa1d44f1 100644 --- a/src/main/java/com/google/jenkins/plugins/computeengine/ComputeEngineComputerLauncher.java +++ b/src/main/java/com/google/jenkins/plugins/computeengine/ComputeEngineComputerLauncher.java @@ -16,6 +16,7 @@ package com.google.jenkins.plugins.computeengine; +import com.google.api.client.googleapis.json.GoogleJsonResponseException; import com.google.api.services.compute.model.AccessConfig; import com.google.api.services.compute.model.Instance; import com.google.api.services.compute.model.NetworkInterface; @@ -40,6 +41,7 @@ import java.io.PrintStream; import java.net.InetSocketAddress; import java.net.Proxy; +import java.net.SocketTimeoutException; import java.util.Base64; import java.util.Optional; import java.util.logging.Level; @@ -349,6 +351,10 @@ protected Connection connectToSsh(ComputeEngineComputer computer, TaskListener l + ")"); } Instance instance = computer.refreshInstance(); + // the instance will be null when the node is terminated + if (instance == null) { + return null; + } String host = ""; @@ -416,11 +422,25 @@ protected Connection connectToSsh(ComputeEngineComputer computer, TaskListener l SSH_TIMEOUT_MILLIS); logInfo(computer, listener, "Connected via SSH."); return conn; - } catch (IOException e) { + } catch (GoogleJsonResponseException e) { + if (e.getStatusCode() == 404) { + log( + LOGGER, + Level.SEVERE, + listener, + String.format("Instance %s not found. Terminating instance.", computer.getName())); + terminateNode(computer, listener); + } + } catch (SocketTimeoutException e) { // keep retrying until SSH comes up - logInfo(computer, listener, "Failed to connect via ssh: " + e.getMessage()); - logInfo(computer, listener, "Waiting for SSH to come up. Sleeping 5."); + logInfo(computer, listener, String.format("Failed to connect via ssh: %s", e.getMessage())); + logInfo( + computer, + listener, + String.format("Waiting for SSH to come up. Sleeping %d.", SSH_SLEEP_MILLIS / 1000)); Thread.sleep(SSH_SLEEP_MILLIS); + } catch (IOException e) { + logWarning(computer, listener, String.format("An error occured: %s", e.getMessage())); } } } diff --git a/src/main/java/com/google/jenkins/plugins/computeengine/ComputeEngineLinuxLauncher.java b/src/main/java/com/google/jenkins/plugins/computeengine/ComputeEngineLinuxLauncher.java index 928f3c38..50fc679c 100644 --- a/src/main/java/com/google/jenkins/plugins/computeengine/ComputeEngineLinuxLauncher.java +++ b/src/main/java/com/google/jenkins/plugins/computeengine/ComputeEngineLinuxLauncher.java @@ -85,6 +85,9 @@ private Optional bootstrap( logInfo(computer, listener, "Authenticating as " + node.getSshUser()); try { bootstrapConn = connectToSsh(computer, listener); + if (bootstrapConn == null) { + break; + } isAuthenticated = bootstrapConn.authenticateWithPublicKey( node.getSshUser(), Secret.toString(keyCred.getPrivateKey()).toCharArray(), diff --git a/src/main/java/com/google/jenkins/plugins/computeengine/ComputeEngineWindowsLauncher.java b/src/main/java/com/google/jenkins/plugins/computeengine/ComputeEngineWindowsLauncher.java index 30fae98e..317eeeac 100644 --- a/src/main/java/com/google/jenkins/plugins/computeengine/ComputeEngineWindowsLauncher.java +++ b/src/main/java/com/google/jenkins/plugins/computeengine/ComputeEngineWindowsLauncher.java @@ -92,6 +92,9 @@ private Optional bootstrap(ComputeEngineComputer computer, TaskListe logInfo(computer, listener, "Authenticating as " + node.getSshUser()); try { bootstrapConn = connectToSsh(computer, listener); + if (bootstrapConn == null) { + break; + } isAuthenticated = authenticateSSH(node.getSshUser(), windowsConfig, bootstrapConn, listener); } catch (IOException e) { logException(computer, listener, "Exception trying to authenticate", e); From 627fda8ab8fd40a57386ac738ae4f759f1ff1af1 Mon Sep 17 00:00:00 2001 From: Arthur Caron Date: Mon, 9 Dec 2024 18:05:02 +0100 Subject: [PATCH 4/8] Sleep when IOException occurs. --- .../plugins/computeengine/ComputeEngineComputerLauncher.java | 1 + 1 file changed, 1 insertion(+) diff --git a/src/main/java/com/google/jenkins/plugins/computeengine/ComputeEngineComputerLauncher.java b/src/main/java/com/google/jenkins/plugins/computeengine/ComputeEngineComputerLauncher.java index aa1d44f1..975b15bd 100644 --- a/src/main/java/com/google/jenkins/plugins/computeengine/ComputeEngineComputerLauncher.java +++ b/src/main/java/com/google/jenkins/plugins/computeengine/ComputeEngineComputerLauncher.java @@ -441,6 +441,7 @@ protected Connection connectToSsh(ComputeEngineComputer computer, TaskListener l Thread.sleep(SSH_SLEEP_MILLIS); } catch (IOException e) { logWarning(computer, listener, String.format("An error occured: %s", e.getMessage())); + Thread.sleep(SSH_SLEEP_MILLIS); } } } From f6fc88ceb54d90dd77459fc4c5e20141aecc3858 Mon Sep 17 00:00:00 2001 From: Arthur Caron Date: Mon, 9 Dec 2024 18:13:30 +0100 Subject: [PATCH 5/8] ComputeEnginerInstance terminate instance without checking if it still exists. --- .../plugins/computeengine/ComputeEngineInstance.java | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/src/main/java/com/google/jenkins/plugins/computeengine/ComputeEngineInstance.java b/src/main/java/com/google/jenkins/plugins/computeengine/ComputeEngineInstance.java index 6bf1bd8e..db3cbea5 100644 --- a/src/main/java/com/google/jenkins/plugins/computeengine/ComputeEngineInstance.java +++ b/src/main/java/com/google/jenkins/plugins/computeengine/ComputeEngineInstance.java @@ -134,16 +134,8 @@ protected void _terminate(TaskListener listener) throws IOException, Interrupted .createSnapshotSync(cloud.getProjectId(), this.zone, this.getNodeName(), createSnapshotTimeout); } - Map filterLabel = ImmutableMap.of(CLOUD_ID_LABEL_KEY, cloud.getInstanceId()); - var instanceExistsInCloud = - cloud.getClient().listInstancesWithLabel(cloud.getProjectId(), filterLabel).stream() - .anyMatch(instance -> instance.getName().equals(name)); - - // If the instance exists in the cloud, attempt to terminate it. This is an async call and we // return immediately, hoping for the best. - if (instanceExistsInCloud) { - cloud.getClient().terminateInstanceAsync(cloud.getProjectId(), zone, name); - } + cloud.getClient().terminateInstanceAsync(cloud.getProjectId(), zone, name); } catch (CloudNotFoundException cnfe) { listener.error(cnfe.getMessage()); } catch (OperationException oe) { From ce2791c144537bd34b83b04c159b5e7cb471c9f3 Mon Sep 17 00:00:00 2001 From: Arthur Caron Date: Wed, 11 Dec 2024 10:52:48 +0100 Subject: [PATCH 6/8] Fine log added in `terminateNode` method. --- .../plugins/computeengine/ComputeEngineComputerLauncher.java | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/main/java/com/google/jenkins/plugins/computeengine/ComputeEngineComputerLauncher.java b/src/main/java/com/google/jenkins/plugins/computeengine/ComputeEngineComputerLauncher.java index 975b15bd..8278ef77 100644 --- a/src/main/java/com/google/jenkins/plugins/computeengine/ComputeEngineComputerLauncher.java +++ b/src/main/java/com/google/jenkins/plugins/computeengine/ComputeEngineComputerLauncher.java @@ -232,6 +232,8 @@ private static void terminateNode(SlaveComputer slaveComputer, TaskListener list } catch (Exception e) { listener.error(String.format("Failed to terminate node %s", node.getDisplayName())); } + } else { + LOGGER.fine(String.format("Tried to terminate unknown node from computer %s", node.getNodeName())); } } From d811f6c64b5c52f056cef4169db5c7e98619cb9e Mon Sep 17 00:00:00 2001 From: Arthur Caron Date: Wed, 11 Dec 2024 11:23:54 +0100 Subject: [PATCH 7/8] Revert "ComputeEnginerInstance terminate instance without checking if it still exists." This reverts commit f6fc88ceb54d90dd77459fc4c5e20141aecc3858. --- .../plugins/computeengine/ComputeEngineInstance.java | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/main/java/com/google/jenkins/plugins/computeengine/ComputeEngineInstance.java b/src/main/java/com/google/jenkins/plugins/computeengine/ComputeEngineInstance.java index db3cbea5..6bf1bd8e 100644 --- a/src/main/java/com/google/jenkins/plugins/computeengine/ComputeEngineInstance.java +++ b/src/main/java/com/google/jenkins/plugins/computeengine/ComputeEngineInstance.java @@ -134,8 +134,16 @@ protected void _terminate(TaskListener listener) throws IOException, Interrupted .createSnapshotSync(cloud.getProjectId(), this.zone, this.getNodeName(), createSnapshotTimeout); } + Map filterLabel = ImmutableMap.of(CLOUD_ID_LABEL_KEY, cloud.getInstanceId()); + var instanceExistsInCloud = + cloud.getClient().listInstancesWithLabel(cloud.getProjectId(), filterLabel).stream() + .anyMatch(instance -> instance.getName().equals(name)); + + // If the instance exists in the cloud, attempt to terminate it. This is an async call and we // return immediately, hoping for the best. - cloud.getClient().terminateInstanceAsync(cloud.getProjectId(), zone, name); + if (instanceExistsInCloud) { + cloud.getClient().terminateInstanceAsync(cloud.getProjectId(), zone, name); + } } catch (CloudNotFoundException cnfe) { listener.error(cnfe.getMessage()); } catch (OperationException oe) { From 181e799476d016f369023d47587fcfac1716b524 Mon Sep 17 00:00:00 2001 From: Arthur Caron Date: Thu, 12 Dec 2024 15:13:04 +0100 Subject: [PATCH 8/8] Fine log added in `terminateNode` method. --- .../plugins/computeengine/ComputeEngineComputerLauncher.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/main/java/com/google/jenkins/plugins/computeengine/ComputeEngineComputerLauncher.java b/src/main/java/com/google/jenkins/plugins/computeengine/ComputeEngineComputerLauncher.java index 8278ef77..62245006 100644 --- a/src/main/java/com/google/jenkins/plugins/computeengine/ComputeEngineComputerLauncher.java +++ b/src/main/java/com/google/jenkins/plugins/computeengine/ComputeEngineComputerLauncher.java @@ -233,7 +233,8 @@ private static void terminateNode(SlaveComputer slaveComputer, TaskListener list listener.error(String.format("Failed to terminate node %s", node.getDisplayName())); } } else { - LOGGER.fine(String.format("Tried to terminate unknown node from computer %s", node.getNodeName())); + LOGGER.fine( + String.format("Tried to terminate unknown node from computer %s", slaveComputer.getDisplayName())); } }