diff --git a/hbase-common/src/main/java/org/apache/hadoop/hbase/HConstants.java b/hbase-common/src/main/java/org/apache/hadoop/hbase/HConstants.java index 03ff914d7bf0..e3b0318f1d95 100644 --- a/hbase-common/src/main/java/org/apache/hadoop/hbase/HConstants.java +++ b/hbase-common/src/main/java/org/apache/hadoop/hbase/HConstants.java @@ -1473,7 +1473,7 @@ public enum OperationStatusCode { // Regions Recovery based on high storeFileRefCount threshold value public static final String STORE_FILE_REF_COUNT_THRESHOLD = - "hbase.regions.recovery.store.file.count"; + "hbase.regions.recovery.store.file.ref.count"; // default -1 indicates there is no threshold on high storeRefCount public static final int DEFAULT_STORE_FILE_REF_COUNT_THRESHOLD = -1; diff --git a/hbase-common/src/main/resources/hbase-default.xml b/hbase-common/src/main/resources/hbase-default.xml index 77c06d77aa6a..f738f0500b6d 100644 --- a/hbase-common/src/main/resources/hbase-default.xml +++ b/hbase-common/src/main/resources/hbase-default.xml @@ -1912,15 +1912,22 @@ possible configurations would overwhelm and obscure the important. - hbase.regions.recovery.store.file.count + hbase.regions.recovery.store.file.ref.count -1 - Store files Ref Count threshold value considered - for reopening regions. Any region with store files - ref count > this value would be eligible for - reopening by master. Default value -1 indicates - this feature is turned off. Only positive integer - value should be provided to enable the feature. + Very large ref count on a file indicates + that it is a ref leak on that object. Such files + can not be removed even after it is invalidated + via compaction. Only way to recover in such + scenario is to reopen the region which can + release all resources, like the refcount, leases, etc. + This config represents Store files Ref Count threshold + value considered for reopening regions. + Any region with store files ref count > this value + would be eligible for reopening by master. + Default value -1 indicates this feature is turned off. + Only positive integer value should be provided to enable + this feature. diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java index 36fd36bdb453..fa95e1bd8320 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java @@ -1471,7 +1471,7 @@ private void startServiceThreads() throws IOException { getChoreService().scheduleChore(hfileCleaner); // Regions Reopen based on very high storeFileRefCount is considered enabled - // only if hbase.regions.recovery.store.file.count has value > 0 + // only if hbase.regions.recovery.store.file.ref.count has value > 0 final int maxStoreFileRefCount = conf.getInt( HConstants.STORE_FILE_REF_COUNT_THRESHOLD, HConstants.DEFAULT_STORE_FILE_REF_COUNT_THRESHOLD); diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionsRecoveryChore.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionsRecoveryChore.java index dabc371b7859..7502eeb94621 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionsRecoveryChore.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionsRecoveryChore.java @@ -141,9 +141,10 @@ private Map> getTableToRegionsByRefCount( // Here, we take max ref count of all store files and not the cumulative // count of all store files final int maxStoreFileRefCount = regionMetrics.getMaxStoreFileRefCount(); - // ignore store file ref count threshold <= 0 (default is -1 i.e. disabled) - if (storeFileRefCountThreshold > 0 && maxStoreFileRefCount > storeFileRefCountThreshold) { - prepareTableToReopenRegionsMap(tableToReopenRegionsMap, regionMetrics, + + if (maxStoreFileRefCount > storeFileRefCountThreshold) { + final byte[] regionName = regionMetrics.getRegionName(); + prepareTableToReopenRegionsMap(tableToReopenRegionsMap, regionName, maxStoreFileRefCount); } } @@ -154,9 +155,8 @@ private Map> getTableToRegionsByRefCount( private void prepareTableToReopenRegionsMap( final Map> tableToReopenRegionsMap, - final RegionMetrics regionMetrics, final int regionStoreRefCount) { + final byte[] regionName, final int regionStoreRefCount) { - final byte[] regionName = regionMetrics.getRegionName(); final RegionInfo regionInfo = hMaster.getAssignmentManager().getRegionInfo(regionName); final TableName tableName = regionInfo.getTable(); if (TableName.isMetaTableName(tableName)) { @@ -166,9 +166,7 @@ private void prepareTableToReopenRegionsMap( } LOG.warn("Region {} for Table {} has high storeFileRefCount {}, considering it for reopen..", regionInfo.getRegionNameAsString(), tableName, regionStoreRefCount); - if (!tableToReopenRegionsMap.containsKey(tableName)) { - tableToReopenRegionsMap.put(tableName, new ArrayList<>()); - } + tableToReopenRegionsMap.putIfAbsent(tableName, new ArrayList<>()); tableToReopenRegionsMap.get(tableName).add(regionName); } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ReopenTableRegionsProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ReopenTableRegionsProcedure.java index e6487b96f4ca..7bf834c62c8c 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ReopenTableRegionsProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ReopenTableRegionsProcedure.java @@ -57,25 +57,25 @@ public class ReopenTableRegionsProcedure // Specify specific regions of a table to reopen. // if specified null, all regions of the table will be reopened. - private final List regionNamesList; + private final List regionNames; private List regions = Collections.emptyList(); private RetryCounter retryCounter; public ReopenTableRegionsProcedure() { - regionNamesList = null; + regionNames = null; } public ReopenTableRegionsProcedure(TableName tableName) { this.tableName = tableName; - this.regionNamesList = null; + this.regionNames = null; } public ReopenTableRegionsProcedure(final TableName tableName, - final List regionNamesList) { + final List regionNames) { this.tableName = tableName; - this.regionNamesList = regionNamesList; + this.regionNames = regionNames; } @Override @@ -109,9 +109,9 @@ protected Flow executeFromState(MasterProcedureEnv env, ReopenTableRegionsState LOG.info("Table {} is disabled, give up reopening its regions", tableName); return Flow.NO_MORE_STATE; } - List tableRegionsForReopen = env.getAssignmentManager() + List tableRegions = env.getAssignmentManager() .getRegionStates().getRegionsOfTableForReopen(tableName); - regions = prepareRegionsForReopen(tableRegionsForReopen); + regions = getRegionLocationsForReopen(tableRegions); setNextState(ReopenTableRegionsState.REOPEN_TABLE_REGIONS_REOPEN_REGIONS); return Flow.HAS_MORE_STATE; case REOPEN_TABLE_REGIONS_REOPEN_REGIONS: @@ -167,13 +167,13 @@ protected Flow executeFromState(MasterProcedureEnv env, ReopenTableRegionsState } } - private List prepareRegionsForReopen( + private List getRegionLocationsForReopen( List tableRegionsForReopen) { List regionsToReopen = new ArrayList<>(); - if (CollectionUtils.isNotEmpty(regionNamesList) && + if (CollectionUtils.isNotEmpty(regionNames) && CollectionUtils.isNotEmpty(tableRegionsForReopen)) { - for (byte[] regionName : regionNamesList) { + for (byte[] regionName : regionNames) { for (HRegionLocation hRegionLocation : tableRegionsForReopen) { if (Bytes.equals(regionName, hRegionLocation.getRegion().getRegionName())) { regionsToReopen.add(hRegionLocation); diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestRegionsRecoveryChore.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestRegionsRecoveryChore.java index 49346f19bb90..69deac95c494 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestRegionsRecoveryChore.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestRegionsRecoveryChore.java @@ -129,7 +129,7 @@ public void testRegionReopensWithStoreRefConfig() throws Exception { } Stoppable stoppable = new StoppableImplementation(); Configuration configuration = getCustomConf(); - configuration.setInt("hbase.regions.recovery.store.file.count", 300); + configuration.setInt("hbase.regions.recovery.store.file.ref.count", 300); regionsRecoveryChore = new RegionsRecoveryChore(stoppable, configuration, hMaster); regionsRecoveryChore.chore(); @@ -144,6 +144,43 @@ public void testRegionReopensWithStoreRefConfig() throws Exception { .getRegionInfo(Mockito.any()); } + @Test + public void testRegionReopensWithLessThreshold() throws Exception { + regionNo = 0; + ClusterMetrics clusterMetrics = TestRegionsRecoveryChore.getClusterMetrics(4); + final Map serverMetricsMap = + clusterMetrics.getLiveServerMetrics(); + LOG.debug("All Region Names with refCount...."); + for (ServerMetrics serverMetrics : serverMetricsMap.values()) { + Map regionMetricsMap = serverMetrics.getRegionMetrics(); + for (RegionMetrics regionMetrics : regionMetricsMap.values()) { + LOG.debug("name: " + new String(regionMetrics.getRegionName()) + " refCount: " + + regionMetrics.getStoreRefCount()); + } + } + Mockito.when(hMaster.getClusterMetrics()).thenReturn(clusterMetrics); + Mockito.when(hMaster.getAssignmentManager()).thenReturn(assignmentManager); + for (byte[] regionName : REGION_NAME_LIST) { + Mockito.when(assignmentManager.getRegionInfo(regionName)) + .thenReturn(TestRegionsRecoveryChore.getRegionInfo(regionName)); + } + Stoppable stoppable = new StoppableImplementation(); + Configuration configuration = getCustomConf(); + configuration.setInt("hbase.regions.recovery.store.file.ref.count", 400); + regionsRecoveryChore = new RegionsRecoveryChore(stoppable, configuration, hMaster); + regionsRecoveryChore.chore(); + + // Verify that we need to reopen regions of only 1 table + Mockito.verify(hMaster, Mockito.times(1)).reopenRegions(Mockito.any(), Mockito.anyList(), + Mockito.anyLong(), Mockito.anyLong()); + Mockito.verify(hMaster, Mockito.times(1)).getClusterMetrics(); + + // Verify that we need to reopen only 1 region with refCount > 400 + Mockito.verify(hMaster, Mockito.times(1)).getAssignmentManager(); + Mockito.verify(assignmentManager, Mockito.times(1)) + .getRegionInfo(Mockito.any()); + } + @Test public void testRegionReopensWithoutStoreRefConfig() throws Exception { regionNo = 0; @@ -166,7 +203,7 @@ public void testRegionReopensWithoutStoreRefConfig() throws Exception { } Stoppable stoppable = new StoppableImplementation(); Configuration configuration = getCustomConf(); - configuration.unset("hbase.regions.recovery.store.file.count"); + configuration.unset("hbase.regions.recovery.store.file.ref.count"); regionsRecoveryChore = new RegionsRecoveryChore(stoppable, configuration, hMaster); regionsRecoveryChore.chore(); diff --git a/src/main/asciidoc/_chapters/hbase-default.adoc b/src/main/asciidoc/_chapters/hbase-default.adoc index 0a37268fc1f5..59a3a073c362 100644 --- a/src/main/asciidoc/_chapters/hbase-default.adoc +++ b/src/main/asciidoc/_chapters/hbase-default.adoc @@ -2178,17 +2178,24 @@ The percent of region server RPC threads failed to abort RS. `1200000` -[[hbase.regions.recovery.store.file.count]] -*`hbase.regions.recovery.store.file.count`*:: -+ -.Description - - Store files Ref Count threshold value considered - for reopening regions. Any region with store files - ref count > this value would be eligible for - reopening by master. Default value -1 indicates - this feature is turned off. Only positive integer - value should be provided to enable the feature. +[[hbase.regions.recovery.store.file.ref.count]] +*`hbase.regions.recovery.store.file.ref.count`*:: ++ +.Description + + Very large ref count on a file indicates + that it is a ref leak on that object. Such files + can not be removed even after it is invalidated + via compaction. Only way to recover in such + scenario is to reopen the region which can + release all resources, like the refcount, leases, etc. + This config represents Store files Ref Count threshold + value considered for reopening regions. + Any region with store files ref count > this value + would be eligible for reopening by master. + Default value -1 indicates this feature is turned off. + Only positive integer value should be provided to enable + this feature. + .Default