diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java index 4d324808817e..685a2e5abf24 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java @@ -1516,8 +1516,6 @@ protected void stopServiceThreads() { this.assignmentManager.stop(); } - stopProcedureExecutor(); - if (this.walManager != null) { this.walManager.stop(); } @@ -1565,7 +1563,9 @@ private void startProcedureExecutor() throws IOException { procedureExecutor.startWorkers(); } - private void stopProcedureExecutor() { + @Override + protected void stopProcedureExecutorAndStore() { + super.stopProcedureExecutorAndStore(); if (procedureExecutor != null) { configurationManager.deregisterObserver(procedureExecutor.getEnvironment()); procedureExecutor.getEnvironment().getRemoteDispatcher().stop(); @@ -2968,7 +2968,7 @@ public void abort(String reason, Throwable cause) { } try { - stopMaster(); + stopMaster(false); } catch (IOException e) { LOG.error("Exception occurred while stopping master", e); } @@ -3050,16 +3050,25 @@ public void shutdown() throws IOException { } public void stopMaster() throws IOException { + stopMaster(false); + } + + public void stopMaster(boolean isRpc) throws IOException { if (cpHost != null) { cpHost.preStopMaster(); } - stop("Stopped by " + Thread.currentThread().getName()); + stop("Stopped by " + Thread.currentThread().getName(), isRpc); } @Override public void stop(String msg) { + stop(msg, false); + } + + @Override + public void stop(String msg, boolean isRpc) { if (!isStopped()) { - super.stop(msg); + super.stop(msg, isRpc); if (this.activeMasterManager != null) { this.activeMasterManager.stop(); } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterRpcServices.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterRpcServices.java index cd017d841b1f..f5a546eb5718 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterRpcServices.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterRpcServices.java @@ -1551,7 +1551,7 @@ public StopMasterResponse stopMaster(RpcController controller, StopMasterRequest request) throws ServiceException { LOG.info(master.getClientIdAuditPrefix() + " stop"); try { - master.stopMaster(); + master.stopMaster(true); } catch (IOException e) { LOG.error("Exception occurred while stopping master", e); throw new ServiceException(e); diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/CloneSnapshotProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/CloneSnapshotProcedure.java index d351d675384d..8bf1baceeba3 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/CloneSnapshotProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/CloneSnapshotProcedure.java @@ -19,6 +19,7 @@ package org.apache.hadoop.hbase.master.procedure; import java.io.IOException; +import java.io.InterruptedIOException; import java.util.ArrayList; import java.util.HashMap; import java.util.Iterator; @@ -195,6 +196,11 @@ protected Flow executeFromState(final MasterProcedureEnv env, final CloneSnapsho throw new UnsupportedOperationException("unhandled state=" + state); } } catch (IOException e) { + // Check for a specific class; IIoEx is thrown for interrupted thread but not its subclasses. + // Handle interrupted IO; master is probably shutting down, no reason to retry. + if (e.getClass() == InterruptedIOException.class) { + throw new InterruptedException(e.getMessage()); + } if (isRollbackSupported(state)) { setFailure("master-clone-snapshot", e); } else { diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/CreateNamespaceProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/CreateNamespaceProcedure.java index 28f75859c8c6..346c4fb4cb8e 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/CreateNamespaceProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/CreateNamespaceProcedure.java @@ -18,6 +18,7 @@ package org.apache.hadoop.hbase.master.procedure; import java.io.IOException; +import java.io.InterruptedIOException; import org.apache.hadoop.hbase.NamespaceDescriptor; import org.apache.hadoop.hbase.NamespaceExistException; import org.apache.hadoop.hbase.procedure2.ProcedureStateSerializer; @@ -87,6 +88,11 @@ protected Flow executeFromState(final MasterProcedureEnv env, final CreateNamesp throw new UnsupportedOperationException(this + " unhandled state=" + state); } } catch (IOException e) { + // Check for a specific class; IIoEx is thrown for interrupted thread but not its subclasses. + // Handle interrupted IO; master is probably shutting down, no reason to retry. + if (e.getClass() == InterruptedIOException.class) { + throw new InterruptedException(e.getMessage()); + } if (isRollbackSupported(state)) { setFailure("master-create-namespace", e); } else { diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/CreateTableProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/CreateTableProcedure.java index 34fde27d03c0..fcddaab40d67 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/CreateTableProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/CreateTableProcedure.java @@ -19,6 +19,7 @@ package org.apache.hadoop.hbase.master.procedure; import java.io.IOException; +import java.io.InterruptedIOException; import java.util.ArrayList; import java.util.List; @@ -123,6 +124,11 @@ protected Flow executeFromState(final MasterProcedureEnv env, final CreateTableS throw new UnsupportedOperationException("unhandled state=" + state); } } catch (IOException e) { + // Check for a specific class; IIoEx is thrown for interrupted thread but not its subclasses. + // Handle interrupted IO; master is probably shutting down, no reason to retry. + if (e.getClass() == InterruptedIOException.class) { + throw new InterruptedException(e.getMessage()); + } if (isRollbackSupported(state)) { setFailure("master-create-table", e); } else { diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/DeleteNamespaceProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/DeleteNamespaceProcedure.java index d3749a2300fe..1f2d8d64db3f 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/DeleteNamespaceProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/DeleteNamespaceProcedure.java @@ -19,6 +19,8 @@ import java.io.FileNotFoundException; import java.io.IOException; +import java.io.InterruptedIOException; + import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; @@ -95,6 +97,11 @@ protected Flow executeFromState(MasterProcedureEnv env, DeleteNamespaceState sta throw new UnsupportedOperationException(this + " unhandled state=" + state); } } catch (IOException e) { + // Check for a specific class; IIoEx is thrown for interrupted thread but not its subclasses. + // Handle interrupted IO; master is probably shutting down, no reason to retry. + if (e.getClass() == InterruptedIOException.class) { + throw new InterruptedException(e.getMessage()); + } if (isRollbackSupported(state)) { setFailure("master-delete-namespace", e); } else { diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/DeleteTableProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/DeleteTableProcedure.java index 4325d19569b2..4d7b7e8840ea 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/DeleteTableProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/DeleteTableProcedure.java @@ -19,6 +19,7 @@ package org.apache.hadoop.hbase.master.procedure; import java.io.IOException; +import java.io.InterruptedIOException; import java.util.ArrayList; import java.util.Arrays; import java.util.List; @@ -135,6 +136,11 @@ protected Flow executeFromState(final MasterProcedureEnv env, DeleteTableState s throw new UnsupportedOperationException("unhandled state=" + state); } } catch (IOException e) { + // Check for a specific class; IIoEx is thrown for interrupted thread but not its subclasses. + // Handle interrupted IO; master is probably shutting down, no reason to retry. + if (e.getClass() == InterruptedIOException.class) { + throw new InterruptedException(e.getMessage()); + } if (isRollbackSupported(state)) { setFailure("master-delete-table", e); } else { diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/EnableTableProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/EnableTableProcedure.java index 3994304d4239..a0b76c96799e 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/EnableTableProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/EnableTableProcedure.java @@ -18,6 +18,7 @@ package org.apache.hadoop.hbase.master.procedure; import java.io.IOException; +import java.io.InterruptedIOException; import java.util.ArrayList; import java.util.List; import org.apache.hadoop.hbase.Cell; @@ -180,6 +181,11 @@ protected Flow executeFromState(final MasterProcedureEnv env, final EnableTableS throw new UnsupportedOperationException("unhandled state=" + state); } } catch (IOException e) { + // Check for a specific class; IIoEx is thrown for interrupted thread but not its subclasses. + // Handle interrupted IO; master is probably shutting down, no reason to retry. + if (e.getClass() == InterruptedIOException.class) { + throw new InterruptedException(e.getMessage()); + } if (isRollbackSupported(state)) { setFailure("master-enable-table", e); } else { diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ModifyNamespaceProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ModifyNamespaceProcedure.java index e3327e200c9c..535147cc711a 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ModifyNamespaceProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ModifyNamespaceProcedure.java @@ -18,6 +18,7 @@ package org.apache.hadoop.hbase.master.procedure; import java.io.IOException; +import java.io.InterruptedIOException; import org.apache.hadoop.hbase.NamespaceDescriptor; import org.apache.hadoop.hbase.NamespaceNotFoundException; import org.apache.hadoop.hbase.constraint.ConstraintException; @@ -82,6 +83,11 @@ protected Flow executeFromState(final MasterProcedureEnv env, final ModifyNamesp throw new UnsupportedOperationException(this + " unhandled state=" + state); } } catch (IOException e) { + // Check for a specific class; IIoEx is thrown for interrupted thread but not its subclasses. + // Handle interrupted IO; master is probably shutting down, no reason to retry. + if (e.getClass() == InterruptedIOException.class) { + throw new InterruptedException(e.getMessage()); + } if (isRollbackSupported(state)) { setFailure("master-modify-namespace", e); } else { diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ModifyTableProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ModifyTableProcedure.java index dd834db2fe8b..dc065682ca67 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ModifyTableProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ModifyTableProcedure.java @@ -19,6 +19,7 @@ package org.apache.hadoop.hbase.master.procedure; import java.io.IOException; +import java.io.InterruptedIOException; import java.util.HashSet; import java.util.List; import java.util.Set; @@ -136,6 +137,11 @@ protected Flow executeFromState(final MasterProcedureEnv env, final ModifyTableS throw new UnsupportedOperationException("unhandled state=" + state); } } catch (IOException e) { + // Check for a specific class; IIoEx is thrown for interrupted thread but not its subclasses. + // Handle interrupted IO; master is probably shutting down, no reason to retry. + if (e.getClass() == InterruptedIOException.class) { + throw new InterruptedException(e.getMessage()); + } if (isRollbackSupported(state)) { setFailure("master-modify-table", e); } else { diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/RestoreSnapshotProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/RestoreSnapshotProcedure.java index f542449b6915..51fce750a2c5 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/RestoreSnapshotProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/RestoreSnapshotProcedure.java @@ -19,6 +19,7 @@ package org.apache.hadoop.hbase.master.procedure; import java.io.IOException; +import java.io.InterruptedIOException; import java.util.ArrayList; import java.util.HashMap; import java.util.Iterator; @@ -161,6 +162,9 @@ protected Flow executeFromState(final MasterProcedureEnv env, final RestoreSnaps default: throw new UnsupportedOperationException("unhandled state=" + state); } + } catch (InterruptedIOException e) { + // Handle interrupted IO; master is probably shutting down, no reason to retry. + throw new InterruptedException(e.getMessage()); } catch (IOException e) { if (isRollbackSupported(state)) { setFailure("master-restore-snapshot", e); diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/TruncateTableProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/TruncateTableProcedure.java index 52da607ef835..2db313d6f8b6 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/TruncateTableProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/TruncateTableProcedure.java @@ -19,6 +19,7 @@ package org.apache.hadoop.hbase.master.procedure; import java.io.IOException; +import java.io.InterruptedIOException; import java.util.ArrayList; import java.util.Arrays; import java.util.List; @@ -149,6 +150,11 @@ protected Flow executeFromState(final MasterProcedureEnv env, TruncateTableState throw new UnsupportedOperationException("unhandled state=" + state); } } catch (IOException e) { + // Check for a specific class; IIoEx is thrown for interrupted thread but not its subclasses. + // Handle interrupted IO; master is probably shutting down, no reason to retry. + if (e.getClass() == InterruptedIOException.class) { + throw new InterruptedException(e.getMessage()); + } if (isRollbackSupported(state)) { setFailure("master-truncate-table", e); } else { diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java index bcbc78b27b50..1d687be2d7d6 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java @@ -314,7 +314,9 @@ public class HRegionServer extends HasThread implements // Set when a report to the master comes back with a message asking us to // shutdown. Also set by call to stop when debugging or running unit tests // of HRegionServer in isolation. + // TODO: both of those are not protected by any lock... private volatile boolean stopped = false; + private volatile boolean stoppedFromRpc = false; // Go down hard. Used if file system becomes unavailable and also in // debugging and unit tests. @@ -580,6 +582,7 @@ public HRegionServer(Configuration conf) throws IOException { this.abortRequested = false; this.stopped = false; + this.stoppedFromRpc = false; rpcServices = createRpcServices(); useThisHostnameInstead = getUseThisHostnameInstead(conf); @@ -1062,6 +1065,27 @@ public void run() { } } + // First, shut down RCP services so we minimize race potential if we respond to requests + // in the middle of abort, from partial/invalid state. + if (this.rpcServices != null) { + // Note: if we were stopped from an RPC message, this is a race, but previously we'd have + // enough time to respond to that message while all the other stuff was shutting down. + // We don't have that artificial wait anymore, so if stopped from RCP wait explicitly. + // It's still a race so the client may receive a disconnect. + // There seems to be no way to wait for a particular response to be sent out... + if (stoppedFromRpc) { + try { + Thread.sleep(1000); + } catch (InterruptedException e) { + // Ignore, we will complete shutdown. + } + } + this.rpcServices.stop(); + } + + // Then, shut down procedures for the same reason; avoid persisting procedures in bad state. + stopProcedureExecutorAndStore(); + if (this.leases != null) { this.leases.closeAfterLeasesExpire(); } @@ -1093,7 +1117,6 @@ public void run() { if (this.hMemManager != null) this.hMemManager.stop(); if (this.cacheFlusher != null) this.cacheFlusher.interruptIfNecessary(); if (this.compactSplitThread != null) this.compactSplitThread.interruptIfNecessary(); - sendShutdownInterrupt(); // Stop the snapshot and other procedure handlers, forcefully killing all running tasks if (rspmHost != null) { @@ -1173,10 +1196,6 @@ public void run() { stopServiceThreads(); } - if (this.rpcServices != null) { - this.rpcServices.stop(); - } - try { deleteMyEphemeralNode(); } catch (KeeperException.NoNodeException nn) { @@ -2187,7 +2206,11 @@ public ClusterConnection getClusterConnection() { @Override public void stop(final String msg) { - stop(msg, false, RpcServer.getRequestUser().orElse(null)); + stop(msg, false); + } + + public void stop(final String msg, boolean isRpc) { + stop(msg, false, RpcServer.getRequestUser().orElse(null), isRpc); } /** @@ -2196,7 +2219,7 @@ public void stop(final String msg) { * @param force True if this is a regionserver abort * @param user The user executing the stop request, or null if no user is associated */ - public void stop(final String msg, final boolean force, final User user) { + public void stop(final String msg, final boolean force, final User user, boolean isRpc) { if (!this.stopped) { LOG.info("***** STOPPING region server '" + this + "' *****"); if (this.rsHost != null) { @@ -2211,6 +2234,7 @@ public void stop(final String msg, final boolean force, final User user) { LOG.warn("Skipping coprocessor exception on preStop() due to forced shutdown", ioe); } } + this.stoppedFromRpc = isRpc || this.stoppedFromRpc; this.stopped = true; LOG.info("STOPPED: " + msg); // Wakes run() if it is sleeping @@ -2400,6 +2424,10 @@ public RSRpcServices getRSRpcServices() { return rpcServices; } + protected void stopProcedureExecutorAndStore() { + // No-op in RS; there's no procedure store so we don't have to shut the executor down early. + } + /** * Cause the server to exit without closing the regions it is serving, the log * it is using and without notifying the master. Used unit testing and on @@ -2449,7 +2477,7 @@ public void abort(String reason, Throwable cause) { LOG.warn("Unable to report fatal error to master", t); } // shutdown should be run as the internal user - stop(reason, true, null); + stop(reason, true, null, false); } /** @@ -2475,12 +2503,6 @@ protected void kill() { abort("Simulated kill"); } - /** - * Called on stop/abort before closing the cluster connection and meta locator. - */ - protected void sendShutdownInterrupt() { - } - /** * Wait on all threads to finish. Presumption is that all closes and stops * have already been called. diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/RSRpcServices.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/RSRpcServices.java index 688c03d678f1..80cfee51293b 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/RSRpcServices.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/RSRpcServices.java @@ -2355,8 +2355,7 @@ public RollWALWriterResponse rollWALWriter(final RpcController controller, public StopServerResponse stopServer(final RpcController controller, final StopServerRequest request) throws ServiceException { requestCount.increment(); - String reason = request.getReason(); - regionServer.stop(reason); + regionServer.stop(request.getReason(), true); return StopServerResponse.newBuilder().build(); } diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRegionServerAbort.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRegionServerAbort.java index 878ca751e80e..ff7e991edf88 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRegionServerAbort.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRegionServerAbort.java @@ -127,7 +127,7 @@ public void tearDown() throws Exception { * Test that a regionserver is able to abort properly, even when a coprocessor * throws an exception in preStopRegionServer(). */ - @Test + @Test(timeout = 20000L) public void testAbortFromRPC() throws Exception { TableName tableName = TableName.valueOf("testAbortFromRPC"); // create a test table @@ -147,14 +147,26 @@ public void testAbortFromRPC() throws Exception { List regions = cluster.findRegionsForTable(tableName); HRegion firstRegion = cluster.findRegionsForTable(tableName).get(0); - table.put(put); - // Verify that the regionserver is stopped + + // Shutting down RS will shut down RPC endpoint before a put can return, in most cases. + // So, don't wait for the put to finish. + Thread t = new Thread(() -> { + try { + table.put(put); + } catch (IOException e) { + LOG.info("Put failed", e); + } + }); + t.setDaemon(true); + t.start(); + + // Wait for the regionserver is stopped assertNotNull(firstRegion); - assertNotNull(firstRegion.getRegionServerServices()); - LOG.info("isAborted = " + firstRegion.getRegionServerServices().isAborted()); - assertTrue(firstRegion.getRegionServerServices().isAborted()); - LOG.info("isStopped = " + firstRegion.getRegionServerServices().isStopped()); - assertTrue(firstRegion.getRegionServerServices().isStopped()); + RegionServerServices rss = firstRegion.getRegionServerServices(); + assertNotNull(rss); + while (!rss.isAborted() || !rss.isStopped()) { + Thread.sleep(100); + } } /**