From 1ff0b083ab7b2ed5075ca05cebedd6d292954c9f Mon Sep 17 00:00:00 2001 From: Wellington Chevreuil Date: Mon, 9 Jan 2023 10:38:14 +0000 Subject: [PATCH] Revert "HBASE-27551 Add config options to delay assignment to retain last region location" This reverts commit 87fc1435c75fb9fb0e2db15922beb7a7b6c5bc64. --- .../hadoop/hbase/master/ServerManager.java | 2 +- .../TransitRegionStateProcedure.java | 61 +-------------- .../master/TestRetainAssignmentOnRestart.java | 75 ------------------- 3 files changed, 2 insertions(+), 136 deletions(-) diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java index 6a169beb53bf..ed28db78de71 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java @@ -406,7 +406,7 @@ private void checkIsDead(final ServerName serverName, final String what) * Assumes onlineServers is locked. * @return ServerName with matching hostname and port. */ - public ServerName findServerWithSameHostnamePortWithLock(final ServerName serverName) { + private ServerName findServerWithSameHostnamePortWithLock(final ServerName serverName) { ServerName end = ServerName.valueOf(serverName.getHostname(), serverName.getPort(), Long.MAX_VALUE); diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/TransitRegionStateProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/TransitRegionStateProcedure.java index 65431db3561a..72ac2d3827fd 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/TransitRegionStateProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/TransitRegionStateProcedure.java @@ -31,7 +31,6 @@ import org.apache.hadoop.hbase.client.RetriesExhaustedException; import org.apache.hadoop.hbase.master.MetricsAssignmentManager; import org.apache.hadoop.hbase.master.RegionState.State; -import org.apache.hadoop.hbase.master.ServerManager; import org.apache.hadoop.hbase.master.procedure.AbstractStateMachineRegionProcedure; import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv; import org.apache.hadoop.hbase.master.procedure.ServerCrashProcedure; @@ -108,20 +107,6 @@ public class TransitRegionStateProcedure private static final Logger LOG = LoggerFactory.getLogger(TransitRegionStateProcedure.class); - public static final String FORCE_REGION_RETAINMENT = "hbase.master.scp.retain.assignment.force"; - - public static final boolean DEFAULT_FORCE_REGION_RETAINMENT = false; - - public static final String FORCE_REGION_RETAINMENT_WAIT = - "hbase.master.scp.retain.assignment.force.wait"; - - public static final long DEFAULT_FORCE_REGION_RETAINMENT_WAIT = 500; - - public static final String FORCE_REGION_RETAINMENT_RETRIES = - "hbase.master.scp.retain.assignment.force.retries"; - - public static final long DEFAULT_FORCE_REGION_RETAINMENT_RETRIES = 600; - private TransitionType type; private RegionStateTransitionState initialState; @@ -141,14 +126,6 @@ public class TransitRegionStateProcedure private boolean isSplit; - private boolean forceRegionRetainment; - - private ServerManager serverManager; - - private long forceRegionRetainmentWait; - - private long forceRegionRetainmentRetries; - public TransitRegionStateProcedure() { } @@ -186,17 +163,6 @@ protected TransitRegionStateProcedure(MasterProcedureEnv env, RegionInfo hri, } evictCache = env.getMasterConfiguration().getBoolean(EVICT_BLOCKS_ON_CLOSE_KEY, DEFAULT_EVICT_ON_CLOSE); - - forceRegionRetainment = env.getMasterConfiguration().getBoolean(FORCE_REGION_RETAINMENT, - DEFAULT_FORCE_REGION_RETAINMENT); - - forceRegionRetainmentWait = env.getMasterConfiguration().getLong(FORCE_REGION_RETAINMENT_WAIT, - DEFAULT_FORCE_REGION_RETAINMENT_WAIT); - - forceRegionRetainmentRetries = env.getMasterConfiguration() - .getLong(FORCE_REGION_RETAINMENT_RETRIES, DEFAULT_FORCE_REGION_RETAINMENT_RETRIES); - - serverManager = env.getMasterServices().getServerManager(); } protected TransitRegionStateProcedure(MasterProcedureEnv env, RegionInfo hri, @@ -222,25 +188,6 @@ protected boolean waitInitialized(MasterProcedureEnv env) { return am.waitMetaLoaded(this) || am.waitMetaAssigned(this, getRegion()); } - private void checkAndWaitForOriginalServer(ServerName lastHost) - throws ProcedureSuspendedException { - boolean isOnline = serverManager.findServerWithSameHostnamePortWithLock(lastHost) != null; - long retries = 0; - while (!isOnline && retries < forceRegionRetainmentRetries) { - try { - Thread.sleep(forceRegionRetainmentWait); - } catch (InterruptedException e) { - throw new ProcedureSuspendedException(); - } - retries++; - isOnline = serverManager.findServerWithSameHostnamePortWithLock(lastHost) != null; - } - LOG.info( - "{} is true. We waited {} ms for host {} to come back online. " - + "Did host come back online? {}", - FORCE_REGION_RETAINMENT, (retries * forceRegionRetainmentRetries), lastHost, isOnline); - } - private void queueAssign(MasterProcedureEnv env, RegionStateNode regionNode) throws ProcedureSuspendedException { boolean retain = false; @@ -253,15 +200,9 @@ private void queueAssign(MasterProcedureEnv env, RegionStateNode regionNode) regionNode.setRegionLocation(assignCandidate); } else if (regionNode.getLastHost() != null) { retain = true; - LOG.info("Setting lastHost {} as the location for region {}", regionNode.getLastHost(), - regionNode.getRegionInfo().getEncodedName()); + LOG.info("Setting lastHost as the region location {}", regionNode.getLastHost()); regionNode.setRegionLocation(regionNode.getLastHost()); } - if (regionNode.getRegionLocation() != null && forceRegionRetainment) { - LOG.warn("{} is set to true. This may delay regions re-assignment " - + "upon RegionServers crashes or restarts.", FORCE_REGION_RETAINMENT); - checkAndWaitForOriginalServer(regionNode.getRegionLocation()); - } } LOG.info("Starting {}; {}; forceNewPlan={}, retain={}", this, regionNode.toShortString(), forceNewPlan, retain); diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestRetainAssignmentOnRestart.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestRetainAssignmentOnRestart.java index c846be916971..641a07315ff2 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestRetainAssignmentOnRestart.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestRetainAssignmentOnRestart.java @@ -17,8 +17,6 @@ */ package org.apache.hadoop.hbase.master; -import static org.apache.hadoop.hbase.master.assignment.TransitRegionStateProcedure.FORCE_REGION_RETAINMENT; -import static org.apache.hadoop.hbase.master.assignment.TransitRegionStateProcedure.FORCE_REGION_RETAINMENT_WAIT; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertNotEquals; import static org.junit.Assert.assertTrue; @@ -230,79 +228,6 @@ public void testRetainAssignmentOnSingleRSRestart() throws Exception { } } - /** - * This tests the force retaining assignments upon an RS restart, even when master triggers an SCP - */ - @Test - public void testForceRetainAssignment() throws Exception { - UTIL.getConfiguration().setBoolean(FORCE_REGION_RETAINMENT, true); - UTIL.getConfiguration().setLong(FORCE_REGION_RETAINMENT_WAIT, 2000); - setupCluster(); - HMaster master = UTIL.getMiniHBaseCluster().getMaster(); - SingleProcessHBaseCluster cluster = UTIL.getHBaseCluster(); - List threads = cluster.getLiveRegionServerThreads(); - assertEquals(NUM_OF_RS, threads.size()); - int[] rsPorts = new int[NUM_OF_RS]; - for (int i = 0; i < NUM_OF_RS; i++) { - rsPorts[i] = threads.get(i).getRegionServer().getServerName().getPort(); - } - - // We don't have to use SnapshotOfRegionAssignmentFromMeta. We use it here because AM used to - // use it to load all user region placements - SnapshotOfRegionAssignmentFromMeta snapshot = - new SnapshotOfRegionAssignmentFromMeta(master.getConnection()); - snapshot.initialize(); - Map regionToRegionServerMap = snapshot.getRegionToRegionServerMap(); - for (ServerName serverName : regionToRegionServerMap.values()) { - boolean found = false; // Test only, no need to optimize - for (int k = 0; k < NUM_OF_RS && !found; k++) { - found = serverName.getPort() == rsPorts[k]; - } - assertTrue(found); - } - - // Server to be restarted - ServerName deadRS = threads.get(0).getRegionServer().getServerName(); - List deadRSRegions = snapshot.getRegionServerToRegionMap().get(deadRS); - LOG.info("\n\nStopping {} server", deadRS); - cluster.stopRegionServer(deadRS); - - LOG.info("\n\nSleeping a bit"); - Thread.sleep(1000); - - LOG.info("\n\nStarting region server {} second time with the same port", deadRS); - cluster.getConf().setInt(ServerManager.WAIT_ON_REGIONSERVERS_MINTOSTART, 3); - cluster.getConf().setInt(HConstants.REGIONSERVER_PORT, deadRS.getPort()); - cluster.startRegionServer(); - - ensureServersWithSamePort(master, rsPorts); - - // Wait till master is initialized and all regions are assigned - for (TableName TABLE : TABLES) { - UTIL.waitTableAvailable(TABLE); - } - UTIL.waitUntilNoRegionsInTransition(60000); - - snapshot = new SnapshotOfRegionAssignmentFromMeta(master.getConnection()); - snapshot.initialize(); - Map newRegionToRegionServerMap = snapshot.getRegionToRegionServerMap(); - assertEquals(regionToRegionServerMap.size(), newRegionToRegionServerMap.size()); - for (Map.Entry entry : newRegionToRegionServerMap.entrySet()) { - ServerName oldServer = regionToRegionServerMap.get(entry.getKey()); - ServerName currentServer = entry.getValue(); - LOG.info( - "Key=" + entry.getKey() + " oldServer=" + oldServer + ", currentServer=" + currentServer); - assertEquals(entry.getKey().toString(), oldServer.getAddress(), currentServer.getAddress()); - - if (deadRS.getPort() == oldServer.getPort()) { - // Restarted RS start code wont be same - assertNotEquals(oldServer.getStartcode(), currentServer.getStartcode()); - } else { - assertEquals(oldServer.getStartcode(), currentServer.getStartcode()); - } - } - } - private void setupCluster() throws Exception, IOException, InterruptedException { // Set Zookeeper based connection registry since we will stop master and start a new master // without populating the underlying config for the connection.