From 67b20fdd9dee8c244e3ddb05e992031f664fc747 Mon Sep 17 00:00:00 2001 From: Ruanhui <32773751+frostruan@users.noreply.github.com> Date: Fri, 28 Jul 2023 23:18:07 +0800 Subject: [PATCH] HBASE-27988 NPE in AddPeerProcedure recovery (#5331) Co-authored-by: huiruan <876107431@qq.com> Signed-off-by: Duo Zhang --- .../java/org/apache/hadoop/hbase/master/HMaster.java | 9 +++++++++ .../apache/hadoop/hbase/master/MasterServices.java | 6 ++++++ .../hbase/master/replication/AddPeerProcedure.java | 6 +++--- .../AssignReplicationQueuesProcedure.java | 2 +- .../master/replication/ReplicationPeerManager.java | 12 ------------ .../hadoop/hbase/master/MockNoopMasterServices.java | 6 ++++++ 6 files changed, 25 insertions(+), 16 deletions(-) diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java index 1c77e8dfaafa..1b5291491503 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java @@ -54,6 +54,7 @@ import java.util.Set; import java.util.concurrent.ExecutionException; import java.util.concurrent.Future; +import java.util.concurrent.Semaphore; import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeoutException; import java.util.concurrent.atomic.AtomicInteger; @@ -368,6 +369,9 @@ public class HMaster extends HBaseServerBase implements Maste private final ReplicationLogCleanerBarrier replicationLogCleanerBarrier = new ReplicationLogCleanerBarrier(); + // Only allow to add one sync replication peer concurrently + private final Semaphore syncReplicationPeerLock = new Semaphore(1); + // manager of replication private ReplicationPeerManager replicationPeerManager; @@ -4115,6 +4119,11 @@ public ReplicationLogCleanerBarrier getReplicationLogCleanerBarrier() { return replicationLogCleanerBarrier; } + @Override + public Semaphore getSyncReplicationPeerLock() { + return syncReplicationPeerLock; + } + public HashMap>> getReplicationLoad(ServerName[] serverNames) { List peerList = this.getReplicationPeerManager().listPeers(null); diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterServices.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterServices.java index d450fbb45ac0..95166240c789 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterServices.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterServices.java @@ -19,6 +19,7 @@ import java.io.IOException; import java.util.List; +import java.util.concurrent.Semaphore; import org.apache.hadoop.hbase.Server; import org.apache.hadoop.hbase.ServerName; import org.apache.hadoop.hbase.TableDescriptors; @@ -368,6 +369,11 @@ ReplicationPeerConfig getReplicationPeerConfig(String peerId) */ ReplicationLogCleanerBarrier getReplicationLogCleanerBarrier(); + /** + * Returns the SyncReplicationPeerLock. + */ + Semaphore getSyncReplicationPeerLock(); + /** * Returns the {@link SyncReplicationReplayWALManager}. */ diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/replication/AddPeerProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/replication/AddPeerProcedure.java index c469896d3e7d..8f8bdd63ea30 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/replication/AddPeerProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/replication/AddPeerProcedure.java @@ -89,7 +89,7 @@ protected void releaseLatch(MasterProcedureEnv env) { env.getMasterServices().getReplicationLogCleanerBarrier().enable(); } if (peerConfig.isSyncReplication()) { - env.getReplicationPeerManager().releaseSyncReplicationPeerLock(); + env.getMasterServices().getSyncReplicationPeerLock().release(); } super.releaseLatch(env); } @@ -108,7 +108,7 @@ protected void prePeerModification(MasterProcedureEnv env) cpHost.preAddReplicationPeer(peerId, peerConfig); } if (peerConfig.isSyncReplication()) { - if (!env.getReplicationPeerManager().tryAcquireSyncReplicationPeerLock()) { + if (!env.getMasterServices().getSyncReplicationPeerLock().tryAcquire()) { throw suspend(env.getMasterConfiguration(), backoff -> LOG.warn( "Can not acquire sync replication peer lock for peer {}, sleep {} secs", peerId, @@ -147,7 +147,7 @@ protected void afterReplay(MasterProcedureEnv env) { } cleanerDisabled = true; if (peerConfig.isSyncReplication()) { - if (!env.getReplicationPeerManager().tryAcquireSyncReplicationPeerLock()) { + if (!env.getMasterServices().getSyncReplicationPeerLock().tryAcquire()) { throw new IllegalStateException( "Can not acquire sync replication peer lock for peer " + peerId); } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/replication/AssignReplicationQueuesProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/replication/AssignReplicationQueuesProcedure.java index b547c87009dd..298b40d357f0 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/replication/AssignReplicationQueuesProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/replication/AssignReplicationQueuesProcedure.java @@ -186,7 +186,7 @@ protected Flow executeFromState(MasterProcedureEnv env, AssignReplicationQueuesS retryCounter = ProcedureUtil.createRetryCounter(env.getMasterConfiguration()); } long backoff = retryCounter.getBackoffTimeAndIncrementAttempts(); - LOG.warn("Failed to claim replication queues for {}, suspend {}secs {}; {};", crashedServer, + LOG.warn("Failed to claim replication queues for {}, suspend {} secs", crashedServer, backoff / 1000, e); setTimeout(Math.toIntExact(backoff)); setState(ProcedureProtos.ProcedureState.WAITING_TIMEOUT); diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/replication/ReplicationPeerManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/replication/ReplicationPeerManager.java index 53a7a6f00146..988c519f781d 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/replication/ReplicationPeerManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/replication/ReplicationPeerManager.java @@ -32,7 +32,6 @@ import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentMap; import java.util.concurrent.ExecutorService; -import java.util.concurrent.Semaphore; import java.util.concurrent.TimeUnit; import java.util.regex.Pattern; import java.util.stream.Collectors; @@ -111,9 +110,6 @@ public class ReplicationPeerManager implements ConfigurationObserver { SyncReplicationState.DOWNGRADE_ACTIVE, EnumSet.of(SyncReplicationState.STANDBY, SyncReplicationState.ACTIVE))); - // Only allow to add one sync replication peer concurrently - private final Semaphore syncReplicationPeerLock = new Semaphore(1); - private final String clusterId; private volatile Configuration conf; @@ -713,14 +709,6 @@ private boolean isStringEquals(String s1, String s2) { return s1.equals(s2); } - public boolean tryAcquireSyncReplicationPeerLock() { - return syncReplicationPeerLock.tryAcquire(); - } - - public void releaseSyncReplicationPeerLock() { - syncReplicationPeerLock.release(); - } - @Override public void onConfigurationChange(Configuration conf) { this.conf = conf; diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/MockNoopMasterServices.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/MockNoopMasterServices.java index d526358ceb4e..c82220a8b22a 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/MockNoopMasterServices.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/MockNoopMasterServices.java @@ -21,6 +21,7 @@ import java.io.IOException; import java.util.List; +import java.util.concurrent.Semaphore; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.hbase.ChoreService; @@ -530,4 +531,9 @@ public boolean isReplicationPeerModificationEnabled() { public ReplicationLogCleanerBarrier getReplicationLogCleanerBarrier() { return null; } + + @Override + public Semaphore getSyncReplicationPeerLock() { + return null; + } }