From d4ce6e6ad72f46528db38319b8c7eebaaa197389 Mon Sep 17 00:00:00 2001
From: "lvhaiping.lhp"
Date: Thu, 28 Sep 2023 10:24:28 +0800
Subject: [PATCH 1/4] To prevent threads from being blocked on the
 RegionStateNode lock, acquire it in checkOnlineRegionsReport with tryLock

---
 .../master/assignment/AssignmentManager.java  | 46 +++++++++++--------
 .../master/assignment/RegionStateNode.java    |  5 ++
 2 files changed, 33 insertions(+), 18 deletions(-)

diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java
index f2cfad87997c..af5a8195caf0 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java
@@ -1398,27 +1398,37 @@ private void checkOnlineRegionsReport(ServerStateNode serverNode, Set<byte[]> regionNames) {
         continue;
       }
       final long lag = 1000;
-      regionNode.lock();
       try {
-        long diff = EnvironmentEdgeManager.currentTime() - regionNode.getLastUpdate();
-        if (regionNode.isInState(State.OPENING, State.OPEN)) {
-          // This is possible as a region server has just closed a region but the region server
-          // report is generated before the closing, but arrive after the closing. Make sure there
-          // is some elapsed time so less false alarms.
-          if (!regionNode.getRegionLocation().equals(serverName) && diff > lag) {
-            LOG.warn("Reporting {} server does not match {} (time since last "
-              + "update={}ms); closing...", serverName, regionNode, diff);
-            closeRegionSilently(serverNode.getServerName(), regionName);
-          }
-        } else if (!regionNode.isInState(State.CLOSING, State.SPLITTING)) {
-          // So, we can get report that a region is CLOSED or SPLIT because a heartbeat
-          // came in at about same time as a region transition. Make sure there is some
-          // elapsed time so less false alarms.
-          if (diff > lag) {
-            LOG.warn("Reporting {} state does not match {} (time since last update={}ms)",
-              serverName, regionNode, diff);
+        // It is likely that another thread is currently holding the lock. To avoid deadlock, use
+        // tryLock waiting for a specified period of time
+        if (regionNode.tryLock(10, TimeUnit.SECONDS)) {
+          long diff = EnvironmentEdgeManager.currentTime() - regionNode.getLastUpdate();
+          if (regionNode.isInState(State.OPENING, State.OPEN)) {
+            // This is possible as a region server has just closed a region but the region server
+            // report is generated before the closing, but arrive after the closing. Make sure there
+            // is some elapsed time so less false alarms.
+            if (!regionNode.getRegionLocation().equals(serverName) && diff > lag) {
+              LOG.warn("Reporting {} server does not match {} (time since last "
+                + "update={}ms); closing...", serverName, regionNode, diff);
+              closeRegionSilently(serverNode.getServerName(), regionName);
+            }
+          } else if (!regionNode.isInState(State.CLOSING, State.SPLITTING)) {
+            // So, we can get report that a region is CLOSED or SPLIT because a heartbeat
+            // came in at about same time as a region transition. Make sure there is some
+            // elapsed time so less false alarms.
+            if (diff > lag) {
+              LOG.warn("Reporting {} state does not match {} (time since last update={}ms)",
+                serverName, regionNode, diff);
+            }
           }
+        } else {
+          LOG.warn(
+            "Unable to acquire lock for regionNode {}. It is likely that another thread is currently holding the lock. To avoid deadlock, skip execution for now.",
+            regionNode);
+          continue;
         }
+      } catch (InterruptedException e) {
+        Thread.currentThread().interrupt();
       } finally {
         regionNode.unlock();
       }
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStateNode.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStateNode.java
index 3856ce227ba1..e9e46bfb54bf 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStateNode.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStateNode.java
@@ -19,6 +19,7 @@
 import java.util.Arrays;
 import java.util.concurrent.ConcurrentMap;
+import java.util.concurrent.TimeUnit;
 import java.util.concurrent.locks.Lock;
 import java.util.concurrent.locks.ReentrantLock;
 import org.apache.hadoop.hbase.HConstants;
@@ -323,6 +324,10 @@ public void lock() {
     lock.lock();
   }
 
+  public boolean tryLock(long time, TimeUnit unit) throws InterruptedException {
+    return lock.tryLock(time, unit);
+  }
+
   public void unlock() {
     lock.unlock();
   }

From 5d5ff0a2a9ba15d5496494cb544eb6e31da2af18 Mon Sep 17 00:00:00 2001
From: "lvhaiping.lhp"
Date: Sat, 7 Oct 2023 16:41:22 +0800
Subject: [PATCH 2/4] Adjust the position of tryLock

---
 .../master/assignment/AssignmentManager.java | 42 ++++++++++---------
 1 file changed, 22 insertions(+), 20 deletions(-)

diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java
index af5a8195caf0..c41476aafa9a 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java
@@ -1402,35 +1402,37 @@ private void checkOnlineRegionsReport(ServerStateNode serverNode, Set<byte[]> regionNames) {
         // It is likely that another thread is currently holding the lock. To avoid deadlock, use
         // tryLock waiting for a specified period of time
         if (regionNode.tryLock(10, TimeUnit.SECONDS)) {
-          long diff = EnvironmentEdgeManager.currentTime() - regionNode.getLastUpdate();
-          if (regionNode.isInState(State.OPENING, State.OPEN)) {
-            // This is possible as a region server has just closed a region but the region server
-            // report is generated before the closing, but arrive after the closing. Make sure there
-            // is some elapsed time so less false alarms.
-            if (!regionNode.getRegionLocation().equals(serverName) && diff > lag) {
-              LOG.warn("Reporting {} server does not match {} (time since last "
-                + "update={}ms); closing...", serverName, regionNode, diff);
-              closeRegionSilently(serverNode.getServerName(), regionName);
-            }
-          } else if (!regionNode.isInState(State.CLOSING, State.SPLITTING)) {
-            // So, we can get report that a region is CLOSED or SPLIT because a heartbeat
-            // came in at about same time as a region transition. Make sure there is some
-            // elapsed time so less false alarms.
-            if (diff > lag) {
-              LOG.warn("Reporting {} state does not match {} (time since last update={}ms)",
-                serverName, regionNode, diff);
+          try {
+            long diff = EnvironmentEdgeManager.currentTime() - regionNode.getLastUpdate();
+            if (regionNode.isInState(State.OPENING, State.OPEN)) {
+              // This is possible as a region server has just closed a region but the region server
+              // report is generated before the closing, but arrive after the closing. Make sure
+              // there
+              // is some elapsed time so less false alarms.
+              if (!regionNode.getRegionLocation().equals(serverName) && diff > lag) {
+                LOG.warn("Reporting {} server does not match {} (time since last "
+                  + "update={}ms); closing...", serverName, regionNode, diff);
+                closeRegionSilently(serverNode.getServerName(), regionName);
+              }
+            } else if (!regionNode.isInState(State.CLOSING, State.SPLITTING)) {
+              // So, we can get report that a region is CLOSED or SPLIT because a heartbeat
+              // came in at about same time as a region transition. Make sure there is some
+              // elapsed time so less false alarms.
+              if (diff > lag) {
+                LOG.warn("Reporting {} state does not match {} (time since last update={}ms)",
+                  serverName, regionNode, diff);
+              }
             }
+          } finally {
+            regionNode.unlock();
           }
         } else {
           LOG.warn(
             "Unable to acquire lock for regionNode {}. It is likely that another thread is currently holding the lock. To avoid deadlock, skip execution for now.",
             regionNode);
-          continue;
         }
       } catch (InterruptedException e) {
         Thread.currentThread().interrupt();
-      } finally {
-        regionNode.unlock();
       }
     }
   }

From b15c17854bcaf60fc4a119f42248832dc52f7213 Mon Sep 17 00:00:00 2001
From: "lvhaiping.lhp"
Date: Mon, 16 Oct 2023 10:36:55 +0800
Subject: [PATCH 3/4] Use tryLock without a timeout so it does not throw
 InterruptedException

---
 .../master/assignment/AssignmentManager.java | 60 +++++++++----------
 .../master/assignment/RegionStateNode.java   |  5 +-
 2 files changed, 30 insertions(+), 35 deletions(-)

diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java
index c41476aafa9a..d83a49ffd714 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java
@@ -1398,41 +1398,37 @@ private void checkOnlineRegionsReport(ServerStateNode serverNode, Set<byte[]> regionNames) {
         continue;
       }
       final long lag = 1000;
-      try {
-        // It is likely that another thread is currently holding the lock. To avoid deadlock, use
-        // tryLock waiting for a specified period of time
-        if (regionNode.tryLock(10, TimeUnit.SECONDS)) {
-          try {
-            long diff = EnvironmentEdgeManager.currentTime() - regionNode.getLastUpdate();
-            if (regionNode.isInState(State.OPENING, State.OPEN)) {
-              // This is possible as a region server has just closed a region but the region server
-              // report is generated before the closing, but arrive after the closing. Make sure
-              // there
-              // is some elapsed time so less false alarms.
-              if (!regionNode.getRegionLocation().equals(serverName) && diff > lag) {
-                LOG.warn("Reporting {} server does not match {} (time since last "
-                  + "update={}ms); closing...", serverName, regionNode, diff);
-                closeRegionSilently(serverNode.getServerName(), regionName);
-              }
-            } else if (!regionNode.isInState(State.CLOSING, State.SPLITTING)) {
-              // So, we can get report that a region is CLOSED or SPLIT because a heartbeat
-              // came in at about same time as a region transition. Make sure there is some
-              // elapsed time so less false alarms.
-              if (diff > lag) {
-                LOG.warn("Reporting {} state does not match {} (time since last update={}ms)",
-                  serverName, regionNode, diff);
-              }
-            }
-          } finally {
-            regionNode.unlock();
-          }
-        } else {
-          LOG.warn(
-            "Unable to acquire lock for regionNode {}. It is likely that another thread is currently holding the lock. To avoid deadlock, skip execution for now.",
-            regionNode);
-        }
-      } catch (InterruptedException e) {
-        Thread.currentThread().interrupt();
+      // It is likely that another thread is currently holding the lock. To avoid deadlock, use
+      // tryLock waiting for a specified period of time
+      if (regionNode.tryLock()) {
+        try {
+          long diff = EnvironmentEdgeManager.currentTime() - regionNode.getLastUpdate();
+          if (regionNode.isInState(State.OPENING, State.OPEN)) {
+            // This is possible as a region server has just closed a region but the region server
+            // report is generated before the closing, but arrive after the closing. Make sure
+            // there
+            // is some elapsed time so less false alarms.
+            if (!regionNode.getRegionLocation().equals(serverName) && diff > lag) {
+              LOG.warn("Reporting {} server does not match {} (time since last "
+                + "update={}ms); closing...", serverName, regionNode, diff);
+              closeRegionSilently(serverNode.getServerName(), regionName);
+            }
+          } else if (!regionNode.isInState(State.CLOSING, State.SPLITTING)) {
+            // So, we can get report that a region is CLOSED or SPLIT because a heartbeat
+            // came in at about same time as a region transition. Make sure there is some
+            // elapsed time so less false alarms.
+            if (diff > lag) {
+              LOG.warn("Reporting {} state does not match {} (time since last update={}ms)",
+                serverName, regionNode, diff);
+            }
+          }
+        } finally {
+          regionNode.unlock();
+        }
+      } else {
+        LOG.warn(
+          "Unable to acquire lock for regionNode {}. It is likely that another thread is currently holding the lock. To avoid deadlock, skip execution for now.",
+          regionNode);
       }
     }
   }
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStateNode.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStateNode.java
index e9e46bfb54bf..91c0222facd1 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStateNode.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStateNode.java
@@ -19,7 +19,6 @@
 import java.util.Arrays;
 import java.util.concurrent.ConcurrentMap;
-import java.util.concurrent.TimeUnit;
 import java.util.concurrent.locks.Lock;
 import java.util.concurrent.locks.ReentrantLock;
 import org.apache.hadoop.hbase.HConstants;
@@ -324,8 +323,8 @@ public void lock() {
     lock.lock();
   }
 
-  public boolean tryLock(long time, TimeUnit unit) throws InterruptedException {
-    return lock.tryLock(time, unit);
+  public boolean tryLock() {
+    return lock.tryLock();
   }
 
   public void unlock() {

From 756add1d55db1bb73f5087695266da9cb5a2242a Mon Sep 17 00:00:00 2001
From: "lvhaiping.lhp"
Date: Tue, 17 Oct 2023 00:04:23 +0800
Subject: [PATCH 4/4] Modify the comment for regionNode.tryLock

---
 .../hadoop/hbase/master/assignment/AssignmentManager.java | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java
index d83a49ffd714..789a6e2ca89d 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java
@@ -1398,8 +1398,10 @@ private void checkOnlineRegionsReport(ServerStateNode serverNode, Set<byte[]> regionNames) {
         continue;
       }
       final long lag = 1000;
-      // It is likely that another thread is currently holding the lock. To avoid deadlock, use
-      // tryLock waiting for a specified period of time
+      // This is just a fallback check designed to identify unexpected data inconsistencies, so we
+      // use tryLock to attempt to acquire the lock, and if the lock cannot be acquired, we skip the
+      // check. This will not cause any additional problems and also prevents the regionServerReport
+      // call from being stuck for too long which may cause deadlock on region assignment.
       if (regionNode.tryLock()) {
         try {
           long diff = EnvironmentEdgeManager.currentTime() - regionNode.getLastUpdate();
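
Taken together, the four patches replace the blocking regionNode.lock() in
checkOnlineRegionsReport with a non-blocking regionNode.tryLock(), skipping the
consistency check whenever the lock is contended instead of stalling the
regionServerReport call. Below is a minimal self-contained Java sketch of that
final pattern; the class TryLockFallbackCheck and the method
runCheckIfUncontended are illustrative stand-ins, not actual HBase types.

import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantLock;

public class TryLockFallbackCheck {
  private final Lock lock = new ReentrantLock();

  /**
   * Runs the given check only if the lock is free, returning false when another
   * thread holds it, in which case the caller simply tries again on the next
   * periodic report (mirroring the fallback-check reasoning in patch 4).
   */
  public boolean runCheckIfUncontended(Runnable check) {
    // tryLock() returns immediately; it never blocks and never throws
    // InterruptedException, unlike the timed tryLock(time, unit) overload.
    if (!lock.tryLock()) {
      return false; // lock contended: skip this round instead of blocking
    }
    try {
      check.run();
      return true;
    } finally {
      lock.unlock(); // release only on the path that acquired the lock
    }
  }
}

The zero-argument ReentrantLock.tryLock() is why patch 3 could drop the
catch (InterruptedException e) block that the timed
tryLock(10, TimeUnit.SECONDS) of patches 1 and 2 required.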