From 522a022ae961d31b7b4346f90d04c3d08115e7f7 Mon Sep 17 00:00:00 2001 From: Kohsuke Kawaguchi Date: Thu, 20 Oct 2016 16:55:39 +0200 Subject: [PATCH 1/8] [JENKINS-39150] expose diagnostics across all the channels To be used by support-core, we need to be able to enumerate all active channels. We do this via WeakHashMap so that references get automatically garbage collected. Unclosed channel will remain in memory forever, which also helps us find those leaks. --- src/main/java/hudson/remoting/Channel.java | 60 +++++++++++++++++++ .../java/hudson/remoting/ChannelTest.java | 12 ++++ 2 files changed, 72 insertions(+) diff --git a/src/main/java/hudson/remoting/Channel.java b/src/main/java/hudson/remoting/Channel.java index 601cc1a54..d7745a53d 100644 --- a/src/main/java/hudson/remoting/Channel.java +++ b/src/main/java/hudson/remoting/Channel.java @@ -45,6 +45,7 @@ import java.lang.ref.WeakReference; import java.net.URL; import java.util.Collections; +import java.util.Date; import java.util.Hashtable; import java.util.Locale; import java.util.Map; @@ -204,8 +205,32 @@ public class Channel implements VirtualChannel, IChannel, Closeable { */ private final Vector listeners = new Vector(); private int gcCounter; + + /** + * Number of {@link Command} objects sent to the other side. + */ private int commandsSent; + /** + * Number of {@link Command} objects received from the other side. + * + * When a transport is functioning correctly, {@link #commandsSent} of one side + * and {@link #commandsReceived} of the other side should closely match. + */ + private int commandsReceived; + + /** + * Timestamp of the last {@link Command} object sent/received, in + * {@link System#currentTimeMillis()} format. + */ + private long lastCommandSent, lastCommandReceived; + + /** + * Timestamp of when this channel was connected/created, in + * {@link System#currentTimeMillis()} format. + */ + private final long createdAt = System.currentTimeMillis(); + /** * Total number of nanoseconds spent for remote class loading. *

@@ -494,6 +519,8 @@ protected Channel(ChannelBuilder settings, CommandTransport transport) throws IO transport.setup(this, new CommandReceiver() { public void handle(Command cmd) { + commandsReceived++; + lastCommandReceived = System.currentTimeMillis(); updateLastHeard(); if (logger.isLoggable(Level.FINE)) logger.fine("Received " + cmd); @@ -509,6 +536,7 @@ public void terminate(IOException e) { Channel.this.terminate(e); } }); + ACTIVE_CHANNELS.put(this,ref()); } /** @@ -581,6 +609,7 @@ private ExecutorService createPipeWriterExecutor() { transport.write(cmd, cmd instanceof CloseCommand); commandsSent++; + lastCommandSent = System.currentTimeMillis(); } /** @@ -1148,6 +1177,20 @@ public void dumpPerformanceCounters(PrintWriter w) throws IOException { w.printf(Locale.ENGLISH, "Resource loading time=%,dms%n", resourceLoadingTime.get() / (1000 * 1000)); } + /** + * Print the diagnostic information. + */ + public void dumpDiagnostics(PrintWriter w) throws IOException { + w.printf("Channel %s%n",name); + w.printf(" Created=%s%n", new Date(createdAt)); + w.printf(" Commands sent=%d%n", commandsSent); + w.printf(" Commands received=%d%n", commandsReceived); + w.printf(" Last command sent=%s%n", new Date(lastCommandSent)); + w.printf(" Last command received=%s%n", new Date(lastCommandReceived)); + w.printf(" Pending outgoing calls=%d%n", pendingCalls.size()); + w.printf(" Pending incoming calls=%d%n", pendingCalls.size()); + } + /** * {@inheritDoc} */ @@ -1498,6 +1541,18 @@ public static Channel current() { return CURRENT.get(); } + /** + * Calls {@link #dumpDiagnostics(PrintWriter)} across all the active channels in this system. + * Used for diagnostics. + */ + public static void dumpDiagnosticsForAll(PrintWriter w) throws IOException { + for (Ref ref : ACTIVE_CHANNELS.values()) { + Channel ch = ref.channel(); + if (ch!=null) + ch.dumpDiagnostics(w); + } + } + /** * Remembers the current "channel" associated for this thread. */ @@ -1522,6 +1577,11 @@ public static Channel current() { */ public static final int PIPE_WINDOW_SIZE = Integer.getInteger(Channel.class.getName()+".pipeWindowSize",1024*1024); + /** + * Keep track of active channels in the system for diagnostics purposes. + */ + private static final Map ACTIVE_CHANNELS = Collections.synchronizedMap(new WeakHashMap()); + static final Class jarLoaderProxy; static { diff --git a/src/test/java/hudson/remoting/ChannelTest.java b/src/test/java/hudson/remoting/ChannelTest.java index b4581eeb9..08872ac58 100644 --- a/src/test/java/hudson/remoting/ChannelTest.java +++ b/src/test/java/hudson/remoting/ChannelTest.java @@ -5,7 +5,9 @@ import java.io.IOException; import java.io.ObjectInputStream; +import java.io.PrintWriter; import java.io.Serializable; +import java.io.StringWriter; import java.net.URL; import java.net.URLClassLoader; import java.util.concurrent.TimeUnit; @@ -163,4 +165,14 @@ public T call() throws RuntimeException { return t; } } + + public void testDiagnostics() throws Exception { + StringWriter sw = new StringWriter(); + Channel.dumpDiagnosticsForAll(new PrintWriter(sw)); + System.out.println(sw); + assertTrue(sw.toString().contains("Channel north")); + assertTrue(sw.toString().contains("Channel south")); + assertTrue(sw.toString().contains("Commands sent=0")); + assertTrue(sw.toString().contains("Commands received=0")); + } } From 7c2944a012428b8a4c2d1f884b7ae84b06e23c7f Mon Sep 17 00:00:00 2001 From: Kohsuke Kawaguchi Date: Fri, 21 Oct 2016 11:27:45 +0200 Subject: [PATCH 2/8] Explaining what these stats mean ... and see how they can be interpreted to aid diagnostics --- src/main/java/hudson/remoting/Channel.java | 52 ++++++++++++++++++++++ src/main/java/hudson/remoting/Command.java | 2 + 2 files changed, 54 insertions(+) diff --git a/src/main/java/hudson/remoting/Channel.java b/src/main/java/hudson/remoting/Channel.java index d7745a53d..7936cc34a 100644 --- a/src/main/java/hudson/remoting/Channel.java +++ b/src/main/java/hudson/remoting/Channel.java @@ -32,6 +32,7 @@ import hudson.remoting.forward.PortForwarder; import org.jenkinsci.remoting.CallableDecorator; import org.jenkinsci.remoting.RoleChecker; +import org.jenkinsci.remoting.nio.NioChannelHub; import javax.annotation.CheckForNull; import javax.annotation.Nonnull; @@ -1179,6 +1180,57 @@ public void dumpPerformanceCounters(PrintWriter w) throws IOException { /** * Print the diagnostic information. + * + *

+ * Here's how you interpret these metrics: + * + *

+ *
Created + *
+ * When the channel was created, which is more or less the same thing as when the channel is connected. + * + *
Commands sent + *
+ * Number of {@link Command} objects sent to the other side. More specifically, number of commands + * successfully passed to {@link #transport}, which means data was written to socket with + * {@link ClassicCommandTransport} but just buffered for write with {@link NioChannelHub}. + * + * If you have access to the remoting diagnostics + * of the other side of the channel, then you can compare this with "commandsReceived" metrics of the other + * side to see how many commands are in transit in transport. If {@code commandsSent==commandsReceived}, + * then no command is in transit. + * + *
Commands received + *
+ * Number of {@link Command} objects received from the other side. More precisely, number + * of commands reported from {@link #transport}. So for example, if data is still buffered somewhere + * in the networking stack, it won't be counted here. + * + *
Last command sent + *
+ * The timestamp in which the last command was sent to the other side. The timestamp in which + * {@link #lastCommandSent} was updated. + * + *
Last command received + *
+ * The timestamp in which the last command was sent to the other side. The timestamp in which + * {@link #lastCommandReceived} was updated. + * + *
Pending outgoing calls + *
+ * Number of RPC calls (e.g., method call through a {@linkplain RemoteInvocationHandler proxy}) + * that are made but not returned yet. If you have the remoting diagnostics of the other side, you + * can compare this number with "pending incoming calls" on the other side to see how many RPC + * calls are executing vs in flight. "one side's incoming calls" < "the other side's outgoing calls" + * indicates some RPC requests or responses are passing through the network layer, and mismatch + * between "# of commands sent" vs "# of commands received" can give some indication of whether + * it is request or response that's in flight. + * + *
Pending incoming calls + *
+ * The reverse of "pending outgoing calls". + * Number of RPC calls (e.g., method call through a {@linkplain RemoteInvocationHandler proxy}) + * that the other side has made to us but not yet returned yet. */ public void dumpDiagnostics(PrintWriter w) throws IOException { w.printf("Channel %s%n",name); diff --git a/src/main/java/hudson/remoting/Command.java b/src/main/java/hudson/remoting/Command.java index 87d60b697..cce690952 100644 --- a/src/main/java/hudson/remoting/Command.java +++ b/src/main/java/hudson/remoting/Command.java @@ -31,6 +31,8 @@ /** * One-way command to be sent over to the remote system and executed there. + * This is the smallest unit of message in remoting from one side to another, + * such as "please execute this method" or "here's the return value from an earlier method call". * This is layer 0, the lower most layer. * *

From 29df0f3bb49fb9457c31ce5e48029598f0721e63 Mon Sep 17 00:00:00 2001 From: Kohsuke Kawaguchi Date: Fri, 21 Oct 2016 11:31:28 +0200 Subject: [PATCH 3/8] Linking the ticket --- src/test/java/hudson/remoting/ChannelTest.java | 1 + 1 file changed, 1 insertion(+) diff --git a/src/test/java/hudson/remoting/ChannelTest.java b/src/test/java/hudson/remoting/ChannelTest.java index 08872ac58..bbd1e8819 100644 --- a/src/test/java/hudson/remoting/ChannelTest.java +++ b/src/test/java/hudson/remoting/ChannelTest.java @@ -166,6 +166,7 @@ public T call() throws RuntimeException { } } + @Bug(39150) public void testDiagnostics() throws Exception { StringWriter sw = new StringWriter(); Channel.dumpDiagnosticsForAll(new PrintWriter(sw)); From f27001cc4d1236e12769c57b44aa0476b5693d2b Mon Sep 17 00:00:00 2001 From: Kohsuke Kawaguchi Date: Fri, 21 Oct 2016 11:32:03 +0200 Subject: [PATCH 4/8] Matching the other naming convention --- src/main/java/hudson/remoting/Channel.java | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/main/java/hudson/remoting/Channel.java b/src/main/java/hudson/remoting/Channel.java index 7936cc34a..021199f01 100644 --- a/src/main/java/hudson/remoting/Channel.java +++ b/src/main/java/hudson/remoting/Channel.java @@ -224,7 +224,7 @@ public class Channel implements VirtualChannel, IChannel, Closeable { * Timestamp of the last {@link Command} object sent/received, in * {@link System#currentTimeMillis()} format. */ - private long lastCommandSent, lastCommandReceived; + private long lastCommandSentAt, lastCommandReceivedAt; /** * Timestamp of when this channel was connected/created, in @@ -521,7 +521,7 @@ protected Channel(ChannelBuilder settings, CommandTransport transport) throws IO transport.setup(this, new CommandReceiver() { public void handle(Command cmd) { commandsReceived++; - lastCommandReceived = System.currentTimeMillis(); + lastCommandReceivedAt = System.currentTimeMillis(); updateLastHeard(); if (logger.isLoggable(Level.FINE)) logger.fine("Received " + cmd); @@ -610,7 +610,7 @@ private ExecutorService createPipeWriterExecutor() { transport.write(cmd, cmd instanceof CloseCommand); commandsSent++; - lastCommandSent = System.currentTimeMillis(); + lastCommandSentAt = System.currentTimeMillis(); } /** @@ -1209,12 +1209,12 @@ public void dumpPerformanceCounters(PrintWriter w) throws IOException { *

Last command sent *
* The timestamp in which the last command was sent to the other side. The timestamp in which - * {@link #lastCommandSent} was updated. + * {@link #lastCommandSentAt} was updated. * *
Last command received *
* The timestamp in which the last command was sent to the other side. The timestamp in which - * {@link #lastCommandReceived} was updated. + * {@link #lastCommandReceivedAt} was updated. * *
Pending outgoing calls *
@@ -1237,8 +1237,8 @@ public void dumpDiagnostics(PrintWriter w) throws IOException { w.printf(" Created=%s%n", new Date(createdAt)); w.printf(" Commands sent=%d%n", commandsSent); w.printf(" Commands received=%d%n", commandsReceived); - w.printf(" Last command sent=%s%n", new Date(lastCommandSent)); - w.printf(" Last command received=%s%n", new Date(lastCommandReceived)); + w.printf(" Last command sent=%s%n", new Date(lastCommandSentAt)); + w.printf(" Last command received=%s%n", new Date(lastCommandReceivedAt)); w.printf(" Pending outgoing calls=%d%n", pendingCalls.size()); w.printf(" Pending incoming calls=%d%n", pendingCalls.size()); } From d638fb8d0b81eb9b96ce84593b2d587c6c1b085c Mon Sep 17 00:00:00 2001 From: Kohsuke Kawaguchi Date: Fri, 21 Oct 2016 11:38:01 +0200 Subject: [PATCH 5/8] lastHeard and lastCommandReceivedAt are the same thing --- src/main/java/hudson/remoting/Channel.java | 29 +++++++--------------- 1 file changed, 9 insertions(+), 20 deletions(-) diff --git a/src/main/java/hudson/remoting/Channel.java b/src/main/java/hudson/remoting/Channel.java index 021199f01..323666842 100644 --- a/src/main/java/hudson/remoting/Channel.java +++ b/src/main/java/hudson/remoting/Channel.java @@ -223,6 +223,14 @@ public class Channel implements VirtualChannel, IChannel, Closeable { /** * Timestamp of the last {@link Command} object sent/received, in * {@link System#currentTimeMillis()} format. + * This can be used as a basis for detecting dead connections. + * + *

+ * Note that {@link #lastCommandSentAt} doesn't mean + * anything in terms of whether the underlying network was able to send + * the data (for example, if the other end of a socket connection goes down + * without telling us anything, the {@link SocketOutputStream#write(int)} will + * return right away, and the socket only really times out after 10s of minutes. */ private long lastCommandSentAt, lastCommandReceivedAt; @@ -292,19 +300,6 @@ public class Channel implements VirtualChannel, IChannel, Closeable { */ public final Capability remoteCapability; - /** - * When did we receive any data from this slave the last time? - * This can be used as a basis for detecting dead connections. - *

- * Note that this doesn't include our sender side of the operation, - * as successfully returning from {@link #send(Command)} doesn't mean - * anything in terms of whether the underlying network was able to send - * the data (for example, if the other end of a socket connection goes down - * without telling us anything, the {@link SocketOutputStream#write(int)} will - * return right away, and the socket only really times out after 10s of minutes. - */ - private volatile long lastHeard; - /** * Single-thread executor for running pipe I/O operations. * @@ -522,7 +517,6 @@ protected Channel(ChannelBuilder settings, CommandTransport transport) throws IO public void handle(Command cmd) { commandsReceived++; lastCommandReceivedAt = System.currentTimeMillis(); - updateLastHeard(); if (logger.isLoggable(Level.FINE)) logger.fine("Received " + cmd); try { @@ -1563,12 +1557,7 @@ public ExportTable.ExportList startExportRecording() { */ public long getLastHeard() { // TODO - this is not safe against clock skew and is called from jenkins core (and potentially plugins) - return lastHeard; - } - - private void updateLastHeard() { - // TODO - this is not safe against clock skew and is called from jenkins core (and potentially plugins) - lastHeard = System.currentTimeMillis(); + return lastCommandReceivedAt; } /*package*/ static Channel setCurrent(Channel channel) { From 9667d4fd20575d8ed99215afc399f9d8a6f73402 Mon Sep 17 00:00:00 2001 From: Kohsuke Kawaguchi Date: Fri, 21 Oct 2016 11:42:40 +0200 Subject: [PATCH 6/8] write to long can be non-atomic without volatile JLS (http://docs.oracle.com/javase/specs/jls/se7/html/jls-17.html#jls-17.7) states that a write to non-volatile long variable can go in 32bit batch, so without it, read could retrieve a completely bogus value. There's no risk of writer contention here because that is serialized by the context in which it gets invoked. --- src/main/java/hudson/remoting/Channel.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/hudson/remoting/Channel.java b/src/main/java/hudson/remoting/Channel.java index 323666842..e9d02fafd 100644 --- a/src/main/java/hudson/remoting/Channel.java +++ b/src/main/java/hudson/remoting/Channel.java @@ -232,7 +232,7 @@ public class Channel implements VirtualChannel, IChannel, Closeable { * without telling us anything, the {@link SocketOutputStream#write(int)} will * return right away, and the socket only really times out after 10s of minutes. */ - private long lastCommandSentAt, lastCommandReceivedAt; + private volatile long lastCommandSentAt, lastCommandReceivedAt; /** * Timestamp of when this channel was connected/created, in From 174168c2d1b7db1f3cd0b64939afa590d233b678 Mon Sep 17 00:00:00 2001 From: Kohsuke Kawaguchi Date: Fri, 21 Oct 2016 12:03:57 +0200 Subject: [PATCH 7/8] CME if new Channel is created during iteration --- src/main/java/hudson/remoting/Channel.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/hudson/remoting/Channel.java b/src/main/java/hudson/remoting/Channel.java index e9d02fafd..7e72fb6fc 100644 --- a/src/main/java/hudson/remoting/Channel.java +++ b/src/main/java/hudson/remoting/Channel.java @@ -1587,7 +1587,7 @@ public static Channel current() { * Used for diagnostics. */ public static void dumpDiagnosticsForAll(PrintWriter w) throws IOException { - for (Ref ref : ACTIVE_CHANNELS.values()) { + for (Ref ref : ACTIVE_CHANNELS.values().toArray(new Ref[0])) { Channel ch = ref.channel(); if (ch!=null) ch.dumpDiagnostics(w); From 07b99b09670dc456777d9a991021f36d21d6b042 Mon Sep 17 00:00:00 2001 From: Kohsuke Kawaguchi Date: Fri, 21 Oct 2016 15:21:58 +0200 Subject: [PATCH 8/8] Added @since --- src/main/java/hudson/remoting/Channel.java | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/main/java/hudson/remoting/Channel.java b/src/main/java/hudson/remoting/Channel.java index 7e72fb6fc..4aafdcfe6 100644 --- a/src/main/java/hudson/remoting/Channel.java +++ b/src/main/java/hudson/remoting/Channel.java @@ -1225,6 +1225,8 @@ public void dumpPerformanceCounters(PrintWriter w) throws IOException { * The reverse of "pending outgoing calls". * Number of RPC calls (e.g., method call through a {@linkplain RemoteInvocationHandler proxy}) * that the other side has made to us but not yet returned yet. + * + * @since 2.26.3 */ public void dumpDiagnostics(PrintWriter w) throws IOException { w.printf("Channel %s%n",name); @@ -1585,6 +1587,8 @@ public static Channel current() { /** * Calls {@link #dumpDiagnostics(PrintWriter)} across all the active channels in this system. * Used for diagnostics. + * + * @since 2.26.3 */ public static void dumpDiagnosticsForAll(PrintWriter w) throws IOException { for (Ref ref : ACTIVE_CHANNELS.values().toArray(new Ref[0])) {