Skip to content

Commit

Permalink
HBASE-28113 Modify the way of acquiring the RegionStateNode lock in c…
Browse files Browse the repository at this point in the history
…heckOnlineRegionsReport to tryLock (apache#5442)

* To prevent threads from being blocked by the lock of RegionStateNode, modify the way of acquiring the RegionStateNode lock in checkOnlineRegionsReport to tryLock.

Co-authored-by: lvhaiping.lhp <[email protected]>
Signed-off-by: Duo Zhang <[email protected]>
  • Loading branch information
2 people authored and wchevreuil committed Nov 2, 2023
1 parent d244956 commit 522ba04
Show file tree
Hide file tree
Showing 2 changed files with 35 additions and 21 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -1398,29 +1398,39 @@ private void checkOnlineRegionsReport(ServerStateNode serverNode, Set<byte[]> re
continue;
}
final long lag = 1000;
regionNode.lock();
try {
long diff = EnvironmentEdgeManager.currentTime() - regionNode.getLastUpdate();
if (regionNode.isInState(State.OPENING, State.OPEN)) {
// This is possible as a region server has just closed a region but the region server
// report is generated before the closing, but arrive after the closing. Make sure there
// is some elapsed time so less false alarms.
if (!regionNode.getRegionLocation().equals(serverName) && diff > lag) {
LOG.warn("Reporting {} server does not match {} (time since last "
+ "update={}ms); closing...", serverName, regionNode, diff);
closeRegionSilently(serverNode.getServerName(), regionName);
}
} else if (!regionNode.isInState(State.CLOSING, State.SPLITTING)) {
// So, we can get report that a region is CLOSED or SPLIT because a heartbeat
// came in at about same time as a region transition. Make sure there is some
// elapsed time so less false alarms.
if (diff > lag) {
LOG.warn("Reporting {} state does not match {} (time since last update={}ms)",
serverName, regionNode, diff);
// This is just a fallback check designed to identify unexpected data inconsistencies, so we
// use tryLock to attempt to acquire the lock, and if the lock cannot be acquired, we skip the
// check. This will not cause any additional problems and also prevents the regionServerReport
// call from being stuck for too long which may cause deadlock on region assignment.
if (regionNode.tryLock()) {
try {
long diff = EnvironmentEdgeManager.currentTime() - regionNode.getLastUpdate();
if (regionNode.isInState(State.OPENING, State.OPEN)) {
// This is possible as a region server has just closed a region but the region server
// report is generated before the closing, but arrive after the closing. Make sure
// there
// is some elapsed time so less false alarms.
if (!regionNode.getRegionLocation().equals(serverName) && diff > lag) {
LOG.warn("Reporting {} server does not match {} (time since last "
+ "update={}ms); closing...", serverName, regionNode, diff);
closeRegionSilently(serverNode.getServerName(), regionName);
}
} else if (!regionNode.isInState(State.CLOSING, State.SPLITTING)) {
// So, we can get report that a region is CLOSED or SPLIT because a heartbeat
// came in at about same time as a region transition. Make sure there is some
// elapsed time so less false alarms.
if (diff > lag) {
LOG.warn("Reporting {} state does not match {} (time since last update={}ms)",
serverName, regionNode, diff);
}
}
} finally {
regionNode.unlock();
}
} finally {
regionNode.unlock();
} else {
LOG.warn(
"Unable to acquire lock for regionNode {}. It is likely that another thread is currently holding the lock. To avoid deadlock, skip execution for now.",
regionNode);
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -323,6 +323,10 @@ public void lock() {
lock.lock();
}

public boolean tryLock() {
return lock.tryLock();
}

public void unlock() {
lock.unlock();
}
Expand Down

0 comments on commit 522ba04

Please sign in to comment.