diff --git a/hbase-client/src/main/java/org/apache/hadoop/hbase/RegionMetrics.java b/hbase-client/src/main/java/org/apache/hadoop/hbase/RegionMetrics.java index 3905f8916631..4d9ebc28eb2b 100644 --- a/hbase-client/src/main/java/org/apache/hadoop/hbase/RegionMetrics.java +++ b/hbase-client/src/main/java/org/apache/hadoop/hbase/RegionMetrics.java @@ -154,4 +154,10 @@ default String getNameAsString() { * @return the reference count for the stores of this region */ int getStoreRefCount(); + + /** + * @return the max reference count for any store file among all stores files + * of this region + */ + int getMaxStoreFileRefCount(); } diff --git a/hbase-client/src/main/java/org/apache/hadoop/hbase/RegionMetricsBuilder.java b/hbase-client/src/main/java/org/apache/hadoop/hbase/RegionMetricsBuilder.java index acd493f521b3..25d44a933a85 100644 --- a/hbase-client/src/main/java/org/apache/hadoop/hbase/RegionMetricsBuilder.java +++ b/hbase-client/src/main/java/org/apache/hadoop/hbase/RegionMetricsBuilder.java @@ -66,6 +66,7 @@ public static RegionMetrics toRegionMetrics(ClusterStatusProtos.RegionLoad regio .setStoreCount(regionLoadPB.getStores()) .setStoreFileCount(regionLoadPB.getStorefiles()) .setStoreRefCount(regionLoadPB.getStoreRefCount()) + .setMaxStoreFileRefCount(regionLoadPB.getMaxStoreFileRefCount()) .setStoreFileSize(new Size(regionLoadPB.getStorefileSizeMB(), Size.Unit.MEGABYTE)) .setStoreSequenceIds(regionLoadPB.getStoreCompleteSequenceIdList().stream() .collect(Collectors.toMap( @@ -113,6 +114,7 @@ public static ClusterStatusProtos.RegionLoad toRegionLoad(RegionMetrics regionMe .setStores(regionMetrics.getStoreCount()) .setStorefiles(regionMetrics.getStoreFileCount()) .setStoreRefCount(regionMetrics.getStoreRefCount()) + .setMaxStoreFileRefCount(regionMetrics.getMaxStoreFileRefCount()) .setStorefileSizeMB((int) regionMetrics.getStoreFileSize().get(Size.Unit.MEGABYTE)) .addAllStoreCompleteSequenceId(toStoreSequenceId(regionMetrics.getStoreSequenceId())) .setStoreUncompressedSizeMB( @@ -128,6 +130,7 @@ public static RegionMetricsBuilder newBuilder(byte[] name) { private int storeCount; private int storeFileCount; private int storeRefCount; + private int maxStoreFileRefCount; private long compactingCellCount; private long compactedCellCount; private Size storeFileSize = Size.ZERO; @@ -161,6 +164,10 @@ public RegionMetricsBuilder setStoreRefCount(int value) { this.storeRefCount = value; return this; } + public RegionMetricsBuilder setMaxStoreFileRefCount(int value) { + this.maxStoreFileRefCount = value; + return this; + } public RegionMetricsBuilder setCompactingCellCount(long value) { this.compactingCellCount = value; return this; @@ -235,6 +242,7 @@ public RegionMetrics build() { storeCount, storeFileCount, storeRefCount, + maxStoreFileRefCount, compactingCellCount, compactedCellCount, storeFileSize, @@ -259,6 +267,7 @@ private static class RegionMetricsImpl implements RegionMetrics { private final int storeCount; private final int storeFileCount; private final int storeRefCount; + private final int maxStoreFileRefCount; private final long compactingCellCount; private final long compactedCellCount; private final Size storeFileSize; @@ -280,6 +289,7 @@ private static class RegionMetricsImpl implements RegionMetrics { int storeCount, int storeFileCount, int storeRefCount, + int maxStoreFileRefCount, final long compactingCellCount, long compactedCellCount, Size storeFileSize, @@ -301,6 +311,7 @@ private static class RegionMetricsImpl implements RegionMetrics { this.storeCount = storeCount; this.storeFileCount = storeFileCount; this.storeRefCount = storeRefCount; + this.maxStoreFileRefCount = maxStoreFileRefCount; this.compactingCellCount = compactingCellCount; this.compactedCellCount = compactedCellCount; this.storeFileSize = Preconditions.checkNotNull(storeFileSize); @@ -340,6 +351,11 @@ public int getStoreRefCount() { return storeRefCount; } + @Override + public int getMaxStoreFileRefCount() { + return maxStoreFileRefCount; + } + @Override public Size getStoreFileSize() { return storeFileSize; @@ -433,6 +449,8 @@ public String toString() { this.getStoreFileCount()); Strings.appendKeyValue(sb, "storeRefCount", this.getStoreRefCount()); + Strings.appendKeyValue(sb, "maxStoreFileRefCount", + this.getMaxStoreFileRefCount()); Strings.appendKeyValue(sb, "uncompressedStoreFileSize", this.getUncompressedStoreFileSize()); Strings.appendKeyValue(sb, "lastMajorCompactionTimestamp", diff --git a/hbase-client/src/main/java/org/apache/hadoop/hbase/ServerMetricsBuilder.java b/hbase-client/src/main/java/org/apache/hadoop/hbase/ServerMetricsBuilder.java index 5721660033a4..b408f0268864 100644 --- a/hbase-client/src/main/java/org/apache/hadoop/hbase/ServerMetricsBuilder.java +++ b/hbase-client/src/main/java/org/apache/hadoop/hbase/ServerMetricsBuilder.java @@ -343,6 +343,8 @@ public long getLastReportTimestamp() { public String toString() { int storeCount = 0; int storeFileCount = 0; + int storeRefCount = 0; + int maxStoreFileRefCount = 0; long uncompressedStoreFileSizeMB = 0; long storeFileSizeMB = 0; long memStoreSizeMB = 0; @@ -358,6 +360,9 @@ public String toString() { for (RegionMetrics r : getRegionMetrics().values()) { storeCount += r.getStoreCount(); storeFileCount += r.getStoreFileCount(); + storeRefCount += r.getStoreRefCount(); + int currentMaxStoreFileRefCount = r.getMaxStoreFileRefCount(); + maxStoreFileRefCount = Math.max(maxStoreFileRefCount, currentMaxStoreFileRefCount); uncompressedStoreFileSizeMB += r.getUncompressedStoreFileSize().get(Size.Unit.MEGABYTE); storeFileSizeMB += r.getStoreFileSize().get(Size.Unit.MEGABYTE); memStoreSizeMB += r.getMemStoreSize().get(Size.Unit.MEGABYTE); @@ -379,6 +384,8 @@ public String toString() { Strings.appendKeyValue(sb, "maxHeapMB", getMaxHeapSize()); Strings.appendKeyValue(sb, "numberOfStores", storeCount); Strings.appendKeyValue(sb, "numberOfStorefiles", storeFileCount); + Strings.appendKeyValue(sb, "storeRefCount", storeRefCount); + Strings.appendKeyValue(sb, "maxStoreFileRefCount", maxStoreFileRefCount); Strings.appendKeyValue(sb, "storefileUncompressedSizeMB", uncompressedStoreFileSizeMB); Strings.appendKeyValue(sb, "storefileSizeMB", storeFileSizeMB); if (uncompressedStoreFileSizeMB != 0) { diff --git a/hbase-common/src/main/java/org/apache/hadoop/hbase/HConstants.java b/hbase-common/src/main/java/org/apache/hadoop/hbase/HConstants.java index 8ee48c9f5290..d339ca569e6f 100644 --- a/hbase-common/src/main/java/org/apache/hadoop/hbase/HConstants.java +++ b/hbase-common/src/main/java/org/apache/hadoop/hbase/HConstants.java @@ -1474,6 +1474,13 @@ public enum OperationStatusCode { // User defined Default TTL config key public static final String DEFAULT_SNAPSHOT_TTL_CONFIG_KEY = "hbase.master.snapshot.ttl"; + // Regions Recovery based on high storeFileRefCount threshold value + public static final String STORE_FILE_REF_COUNT_THRESHOLD = + "hbase.regions.recovery.store.file.ref.count"; + + // default -1 indicates there is no threshold on high storeRefCount + public static final int DEFAULT_STORE_FILE_REF_COUNT_THRESHOLD = -1; + /** * Configurations for master executor services. */ diff --git a/hbase-common/src/main/resources/hbase-default.xml b/hbase-common/src/main/resources/hbase-default.xml index 877cd744f828..f738f0500b6d 100644 --- a/hbase-common/src/main/resources/hbase-default.xml +++ b/hbase-common/src/main/resources/hbase-default.xml @@ -1901,4 +1901,33 @@ possible configurations would overwhelm and obscure the important. automatically deleted until it is manually deleted + + hbase.master.regions.recovery.check.interval + 1200000 + + Regions Recovery Chore interval in milliseconds. + This chore keeps running at this interval to + find all regions with configurable max store file ref count + and reopens them. + + + + hbase.regions.recovery.store.file.ref.count + -1 + + Very large ref count on a file indicates + that it is a ref leak on that object. Such files + can not be removed even after it is invalidated + via compaction. Only way to recover in such + scenario is to reopen the region which can + release all resources, like the refcount, leases, etc. + This config represents Store files Ref Count threshold + value considered for reopening regions. + Any region with store files ref count > this value + would be eligible for reopening by master. + Default value -1 indicates this feature is turned off. + Only positive integer value should be provided to enable + this feature. + + diff --git a/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/regionserver/MetricsRegionServerSource.java b/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/regionserver/MetricsRegionServerSource.java index b7fad253fc29..f4fbf390ae75 100644 --- a/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/regionserver/MetricsRegionServerSource.java +++ b/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/regionserver/MetricsRegionServerSource.java @@ -233,6 +233,7 @@ public interface MetricsRegionServerSource extends BaseSource, JvmPauseMonitorSo String STOREFILE_COUNT_DESC = "Number of Store Files"; String STORE_REF_COUNT = "storeRefCount"; String STORE_REF_COUNT_DESC = "Store reference count"; + String MAX_STORE_FILE_REF_COUNT = "maxStoreFileRefCount"; String MEMSTORE_SIZE = "memStoreSize"; String MEMSTORE_SIZE_DESC = "Size of the memstore"; String STOREFILE_SIZE = "storeFileSize"; diff --git a/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/regionserver/MetricsRegionWrapper.java b/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/regionserver/MetricsRegionWrapper.java index 6f8f60ba4ea5..ccdc209575f9 100644 --- a/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/regionserver/MetricsRegionWrapper.java +++ b/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/regionserver/MetricsRegionWrapper.java @@ -164,4 +164,10 @@ public interface MetricsRegionWrapper { * @return the number of references active on the store */ long getStoreRefCount(); + + /** + * @return the max number of references active on any store file among + * all store files that belong to this region + */ + long getMaxStoreFileRefCount(); } diff --git a/hbase-hadoop2-compat/src/main/java/org/apache/hadoop/hbase/regionserver/MetricsRegionSourceImpl.java b/hbase-hadoop2-compat/src/main/java/org/apache/hadoop/hbase/regionserver/MetricsRegionSourceImpl.java index 265df31449fd..bc8ecb930dc8 100644 --- a/hbase-hadoop2-compat/src/main/java/org/apache/hadoop/hbase/regionserver/MetricsRegionSourceImpl.java +++ b/hbase-hadoop2-compat/src/main/java/org/apache/hadoop/hbase/regionserver/MetricsRegionSourceImpl.java @@ -217,6 +217,10 @@ void snapshot(MetricsRecordBuilder mrb, boolean ignored) { regionNamePrefix + MetricsRegionServerSource.STORE_REF_COUNT, MetricsRegionServerSource.STORE_REF_COUNT), this.regionWrapper.getStoreRefCount()); + mrb.addGauge(Interns.info( + regionNamePrefix + MetricsRegionServerSource.MAX_STORE_FILE_REF_COUNT, + MetricsRegionServerSource.MAX_STORE_FILE_REF_COUNT), + this.regionWrapper.getMaxStoreFileRefCount()); mrb.addGauge(Interns.info( regionNamePrefix + MetricsRegionServerSource.MEMSTORE_SIZE, MetricsRegionServerSource.MEMSTORE_SIZE_DESC), diff --git a/hbase-hadoop2-compat/src/test/java/org/apache/hadoop/hbase/regionserver/TestMetricsRegionSourceImpl.java b/hbase-hadoop2-compat/src/test/java/org/apache/hadoop/hbase/regionserver/TestMetricsRegionSourceImpl.java index 78fc397f0d1d..bafa854feb39 100644 --- a/hbase-hadoop2-compat/src/test/java/org/apache/hadoop/hbase/regionserver/TestMetricsRegionSourceImpl.java +++ b/hbase-hadoop2-compat/src/test/java/org/apache/hadoop/hbase/regionserver/TestMetricsRegionSourceImpl.java @@ -99,6 +99,11 @@ public long getStoreRefCount() { return 0; } + @Override + public long getMaxStoreFileRefCount() { + return 0; + } + @Override public long getMemStoreSize() { return 0; diff --git a/hbase-protocol-shaded/src/main/protobuf/ClusterStatus.proto b/hbase-protocol-shaded/src/main/protobuf/ClusterStatus.proto index ef16ecbac13a..cdc9c06bbb4b 100644 --- a/hbase-protocol-shaded/src/main/protobuf/ClusterStatus.proto +++ b/hbase-protocol-shaded/src/main/protobuf/ClusterStatus.proto @@ -149,6 +149,12 @@ message RegionLoad { /** the number of references active on the store */ optional int32 store_ref_count = 21 [default = 0]; + + /** + * The max number of references active on single store file among all store files + * that belong to given region + */ + optional int32 max_store_file_ref_count = 22 [default = 0]; } /* Server-level protobufs */ diff --git a/hbase-protocol/src/main/protobuf/ClusterStatus.proto b/hbase-protocol/src/main/protobuf/ClusterStatus.proto index 4e89fe6d279d..f920fff79f16 100644 --- a/hbase-protocol/src/main/protobuf/ClusterStatus.proto +++ b/hbase-protocol/src/main/protobuf/ClusterStatus.proto @@ -145,6 +145,12 @@ message RegionLoad { /** the number of references active on the store */ optional int32 store_ref_count = 21 [default = 0]; + + /** + * The max number of references active on single store file among all store files + * that belong to given region + */ + optional int32 max_store_file_ref_count = 22 [default = 0]; } /* Server-level protobufs */ diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java index 98841f95c838..d077514715fb 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java @@ -136,6 +136,7 @@ import org.apache.hadoop.hbase.master.procedure.ProcedurePrepareLatch; import org.apache.hadoop.hbase.master.procedure.ProcedureSyncWait; import org.apache.hadoop.hbase.master.procedure.RecoverMetaProcedure; +import org.apache.hadoop.hbase.master.procedure.ReopenTableRegionsProcedure; import org.apache.hadoop.hbase.master.procedure.ServerCrashProcedure; import org.apache.hadoop.hbase.master.procedure.TruncateTableProcedure; import org.apache.hadoop.hbase.master.replication.AbstractPeerProcedure; @@ -421,6 +422,7 @@ public void run() { // monitor for distributed procedures private MasterProcedureManagerHost mpmHost; + private RegionsRecoveryChore regionsRecoveryChore = null; // it is assigned after 'initialized' guard set to true, so should be volatile private volatile MasterQuotaManager quotaManager; private SpaceQuotaSnapshotNotifier spaceQuotaSnapshotNotifier; @@ -1464,6 +1466,20 @@ private void startServiceThreads() throws IOException { getMasterFileSystem().getFileSystem(), archiveDir, cleanerPool, params); getChoreService().scheduleChore(hfileCleaner); + // Regions Reopen based on very high storeFileRefCount is considered enabled + // only if hbase.regions.recovery.store.file.ref.count has value > 0 + final int maxStoreFileRefCount = conf.getInt( + HConstants.STORE_FILE_REF_COUNT_THRESHOLD, + HConstants.DEFAULT_STORE_FILE_REF_COUNT_THRESHOLD); + if (maxStoreFileRefCount > 0) { + this.regionsRecoveryChore = new RegionsRecoveryChore(this, conf, this); + getChoreService().scheduleChore(this.regionsRecoveryChore); + } else { + LOG.info("Reopening regions with very high storeFileRefCount is disabled. " + + "Provide threshold value > 0 for {} to enable it.", + HConstants.STORE_FILE_REF_COUNT_THRESHOLD); + } + replicationBarrierCleaner = new ReplicationBarrierCleaner(conf, this, getConnection(), replicationPeerManager); getChoreService().scheduleChore(replicationBarrierCleaner); @@ -1631,6 +1647,7 @@ private void stopChores() { choreService.cancelChore(this.replicationBarrierCleaner); choreService.cancelChore(this.snapshotCleanerChore); choreService.cancelChore(this.hbckChore); + choreService.cancelChore(this.regionsRecoveryChore); } } @@ -3732,6 +3749,38 @@ public void remoteProcedureFailed(long procId, RemoteProcedureException error) { } } + /** + * Reopen regions provided in the argument + * + * @param tableName The current table name + * @param regionNames The region names of the regions to reopen + * @param nonceGroup Identifier for the source of the request, a client or process + * @param nonce A unique identifier for this operation from the client or process identified by + * nonceGroup (the source must ensure each operation gets a unique id). + * @return procedure Id + * @throws IOException if reopening region fails while running procedure + */ + long reopenRegions(final TableName tableName, final List regionNames, + final long nonceGroup, final long nonce) + throws IOException { + + return MasterProcedureUtil + .submitProcedure(new MasterProcedureUtil.NonceProcedureRunnable(this, nonceGroup, nonce) { + + @Override + protected void run() throws IOException { + submitProcedure(new ReopenTableRegionsProcedure(tableName, regionNames)); + } + + @Override + protected String getDescription() { + return "ReopenTableRegionsProcedure"; + } + + }); + + } + @Override public ReplicationPeerManager getReplicationPeerManager() { return replicationPeerManager; diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionsRecoveryChore.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionsRecoveryChore.java new file mode 100644 index 000000000000..7502eeb94621 --- /dev/null +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionsRecoveryChore.java @@ -0,0 +1,174 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hbase.master; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hbase.ClusterMetrics; +import org.apache.hadoop.hbase.HConstants; +import org.apache.hadoop.hbase.RegionMetrics; +import org.apache.hadoop.hbase.ScheduledChore; +import org.apache.hadoop.hbase.ServerMetrics; +import org.apache.hadoop.hbase.ServerName; +import org.apache.hadoop.hbase.Stoppable; +import org.apache.hadoop.hbase.TableName; +import org.apache.hadoop.hbase.client.PerClientRandomNonceGenerator; +import org.apache.hadoop.hbase.client.RegionInfo; +import org.apache.yetus.audience.InterfaceAudience; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.hbase.thirdparty.org.apache.commons.collections4.MapUtils; + +/** + * This chore, every time it runs, will try to recover regions with high store ref count + * by reopening them + */ +@InterfaceAudience.Private +public class RegionsRecoveryChore extends ScheduledChore { + + private static final Logger LOG = LoggerFactory.getLogger(RegionsRecoveryChore.class); + + private static final String REGIONS_RECOVERY_CHORE_NAME = "RegionsRecoveryChore"; + + private static final String REGIONS_RECOVERY_INTERVAL = + "hbase.master.regions.recovery.check.interval"; + + private static final int DEFAULT_REGIONS_RECOVERY_INTERVAL = 1200 * 1000; // Default 20 min ? + + private static final String ERROR_REOPEN_REIONS_MSG = + "Error reopening regions with high storeRefCount. "; + + private final HMaster hMaster; + private final int storeFileRefCountThreshold; + + private static final PerClientRandomNonceGenerator NONCE_GENERATOR = + PerClientRandomNonceGenerator.get(); + + /** + * Construct RegionsRecoveryChore with provided params + * + * @param stopper When {@link Stoppable#isStopped()} is true, this chore will cancel and cleanup + * @param configuration The configuration params to be used + * @param hMaster HMaster instance to initiate RegionTableRegions + */ + RegionsRecoveryChore(final Stoppable stopper, final Configuration configuration, + final HMaster hMaster) { + + super(REGIONS_RECOVERY_CHORE_NAME, stopper, configuration.getInt(REGIONS_RECOVERY_INTERVAL, + DEFAULT_REGIONS_RECOVERY_INTERVAL)); + this.hMaster = hMaster; + this.storeFileRefCountThreshold = configuration.getInt( + HConstants.STORE_FILE_REF_COUNT_THRESHOLD, + HConstants.DEFAULT_STORE_FILE_REF_COUNT_THRESHOLD); + + } + + @Override + protected void chore() { + if (LOG.isTraceEnabled()) { + LOG.trace( + "Starting up Regions Recovery chore for reopening regions based on storeFileRefCount..."); + } + try { + // only if storeFileRefCountThreshold > 0, consider the feature turned on + if (storeFileRefCountThreshold > 0) { + final ClusterMetrics clusterMetrics = hMaster.getClusterMetrics(); + final Map serverMetricsMap = + clusterMetrics.getLiveServerMetrics(); + final Map> tableToReopenRegionsMap = + getTableToRegionsByRefCount(serverMetricsMap); + if (MapUtils.isNotEmpty(tableToReopenRegionsMap)) { + tableToReopenRegionsMap.forEach((tableName, regionNames) -> { + try { + LOG.warn("Reopening regions due to high storeFileRefCount. " + + "TableName: {} , noOfRegions: {}", tableName, regionNames.size()); + hMaster.reopenRegions(tableName, regionNames, NONCE_GENERATOR.getNonceGroup(), + NONCE_GENERATOR.newNonce()); + } catch (IOException e) { + LOG.error("{} tableName: {}, regionNames: {}", ERROR_REOPEN_REIONS_MSG, + tableName, regionNames, e); + } + }); + } + } else { + if (LOG.isDebugEnabled()) { + LOG.debug("Reopening regions with very high storeFileRefCount is disabled. " + + "Provide threshold value > 0 for {} to enable it.", + HConstants.STORE_FILE_REF_COUNT_THRESHOLD); + } + } + } catch (Exception e) { + LOG.error("Error while reopening regions based on storeRefCount threshold", e); + } + if (LOG.isTraceEnabled()) { + LOG.trace( + "Exiting Regions Recovery chore for reopening regions based on storeFileRefCount..."); + } + } + + private Map> getTableToRegionsByRefCount( + final Map serverMetricsMap) { + + final Map> tableToReopenRegionsMap = new HashMap<>(); + for (ServerMetrics serverMetrics : serverMetricsMap.values()) { + Map regionMetricsMap = serverMetrics.getRegionMetrics(); + for (RegionMetrics regionMetrics : regionMetricsMap.values()) { + // For each region, each store file can have different ref counts + // We need to find maximum of all such ref counts and if that max count + // is beyond a threshold value, we should reopen the region. + // Here, we take max ref count of all store files and not the cumulative + // count of all store files + final int maxStoreFileRefCount = regionMetrics.getMaxStoreFileRefCount(); + + if (maxStoreFileRefCount > storeFileRefCountThreshold) { + final byte[] regionName = regionMetrics.getRegionName(); + prepareTableToReopenRegionsMap(tableToReopenRegionsMap, regionName, + maxStoreFileRefCount); + } + } + } + return tableToReopenRegionsMap; + + } + + private void prepareTableToReopenRegionsMap( + final Map> tableToReopenRegionsMap, + final byte[] regionName, final int regionStoreRefCount) { + + final RegionInfo regionInfo = hMaster.getAssignmentManager().getRegionInfo(regionName); + final TableName tableName = regionInfo.getTable(); + if (TableName.isMetaTableName(tableName)) { + // Do not reopen regions of meta table even if it has + // high store file reference count + return; + } + LOG.warn("Region {} for Table {} has high storeFileRefCount {}, considering it for reopen..", + regionInfo.getRegionNameAsString(), tableName, regionStoreRefCount); + tableToReopenRegionsMap.putIfAbsent(tableName, new ArrayList<>()); + tableToReopenRegionsMap.get(tableName).add(regionName); + + } + +} diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ReopenTableRegionsProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ReopenTableRegionsProcedure.java index 3dacb5633683..7bf834c62c8c 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ReopenTableRegionsProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ReopenTableRegionsProcedure.java @@ -18,9 +18,11 @@ package org.apache.hadoop.hbase.master.procedure; import java.io.IOException; +import java.util.ArrayList; import java.util.Collections; import java.util.List; import java.util.stream.Collectors; + import org.apache.hadoop.hbase.HRegionLocation; import org.apache.hadoop.hbase.TableName; import org.apache.hadoop.hbase.master.assignment.RegionStateNode; @@ -29,11 +31,14 @@ import org.apache.hadoop.hbase.procedure2.ProcedureSuspendedException; import org.apache.hadoop.hbase.procedure2.ProcedureUtil; import org.apache.hadoop.hbase.procedure2.ProcedureYieldException; +import org.apache.hadoop.hbase.util.Bytes; import org.apache.hadoop.hbase.util.RetryCounter; import org.apache.yetus.audience.InterfaceAudience; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.apache.hbase.thirdparty.org.apache.commons.collections4.CollectionUtils; + import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil; import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.ReopenTableRegionsState; import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.ReopenTableRegionsStateData; @@ -50,15 +55,27 @@ public class ReopenTableRegionsProcedure private TableName tableName; + // Specify specific regions of a table to reopen. + // if specified null, all regions of the table will be reopened. + private final List regionNames; + private List regions = Collections.emptyList(); private RetryCounter retryCounter; public ReopenTableRegionsProcedure() { + regionNames = null; } public ReopenTableRegionsProcedure(TableName tableName) { this.tableName = tableName; + this.regionNames = null; + } + + public ReopenTableRegionsProcedure(final TableName tableName, + final List regionNames) { + this.tableName = tableName; + this.regionNames = regionNames; } @Override @@ -92,8 +109,9 @@ protected Flow executeFromState(MasterProcedureEnv env, ReopenTableRegionsState LOG.info("Table {} is disabled, give up reopening its regions", tableName); return Flow.NO_MORE_STATE; } - regions = - env.getAssignmentManager().getRegionStates().getRegionsOfTableForReopen(tableName); + List tableRegions = env.getAssignmentManager() + .getRegionStates().getRegionsOfTableForReopen(tableName); + regions = getRegionLocationsForReopen(tableRegions); setNextState(ReopenTableRegionsState.REOPEN_TABLE_REGIONS_REOPEN_REGIONS); return Flow.HAS_MORE_STATE; case REOPEN_TABLE_REGIONS_REOPEN_REGIONS: @@ -149,6 +167,26 @@ protected Flow executeFromState(MasterProcedureEnv env, ReopenTableRegionsState } } + private List getRegionLocationsForReopen( + List tableRegionsForReopen) { + + List regionsToReopen = new ArrayList<>(); + if (CollectionUtils.isNotEmpty(regionNames) && + CollectionUtils.isNotEmpty(tableRegionsForReopen)) { + for (byte[] regionName : regionNames) { + for (HRegionLocation hRegionLocation : tableRegionsForReopen) { + if (Bytes.equals(regionName, hRegionLocation.getRegion().getRegionName())) { + regionsToReopen.add(hRegionLocation); + break; + } + } + } + } else { + regionsToReopen = tableRegionsForReopen; + } + return regionsToReopen; + } + /** * At end of timeout, wake ourselves up so we run again. */ diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java index 11fd8c7e54b7..af661a281dd7 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java @@ -1655,6 +1655,8 @@ RegionLoad createRegionLoad(final HRegion r, RegionLoad.Builder regionLoadBldr, byte[] name = r.getRegionInfo().getRegionName(); int stores = 0; int storefiles = 0; + int storeRefCount = 0; + int maxStoreFileRefCount = 0; int storeUncompressedSizeMB = 0; int storefileSizeMB = 0; int memstoreSizeMB = (int) (r.getMemStoreDataSize() / 1024 / 1024); @@ -1668,6 +1670,10 @@ RegionLoad createRegionLoad(final HRegion r, RegionLoad.Builder regionLoadBldr, stores += storeList.size(); for (HStore store : storeList) { storefiles += store.getStorefilesCount(); + int currentStoreRefCount = store.getStoreRefCount(); + storeRefCount += currentStoreRefCount; + int currentMaxStoreFileRefCount = store.getMaxStoreFileRefCount(); + maxStoreFileRefCount = Math.max(maxStoreFileRefCount, currentMaxStoreFileRefCount); storeUncompressedSizeMB += (int) (store.getStoreSizeUncompressed() / 1024 / 1024); storefileSizeMB += (int) (store.getStorefilesSize() / 1024 / 1024); //TODO: storefileIndexSizeKB is same with rootLevelIndexSizeKB? @@ -1695,6 +1701,8 @@ RegionLoad createRegionLoad(final HRegion r, RegionLoad.Builder regionLoadBldr, regionLoadBldr.setRegionSpecifier(regionSpecifier.build()) .setStores(stores) .setStorefiles(storefiles) + .setStoreRefCount(storeRefCount) + .setMaxStoreFileRefCount(maxStoreFileRefCount) .setStoreUncompressedSizeMB(storeUncompressedSizeMB) .setStorefileSizeMB(storefileSizeMB) .setMemStoreSizeMB(memstoreSizeMB) diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HStore.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HStore.java index b5e64673146f..b007898e9641 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HStore.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HStore.java @@ -33,6 +33,7 @@ import java.util.NavigableSet; import java.util.Optional; import java.util.OptionalDouble; +import java.util.OptionalInt; import java.util.OptionalLong; import java.util.Set; import java.util.concurrent.Callable; @@ -2780,4 +2781,20 @@ public int getStoreRefCount() { .filter(sf -> sf.getReader() != null).filter(HStoreFile::isHFile) .mapToInt(HStoreFile::getRefCount).sum(); } + + /** + * @return get maximum ref count of storeFile among all HStore Files + * for the HStore + */ + public int getMaxStoreFileRefCount() { + OptionalInt maxStoreFileRefCount = this.storeEngine.getStoreFileManager() + .getStorefiles() + .stream() + .filter(sf -> sf.getReader() != null) + .filter(HStoreFile::isHFile) + .mapToInt(HStoreFile::getRefCount) + .max(); + return maxStoreFileRefCount.isPresent() ? maxStoreFileRefCount.getAsInt() : 0; + } + } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/MetricsRegionWrapperImpl.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/MetricsRegionWrapperImpl.java index 0c8a325f967b..e15623dff35e 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/MetricsRegionWrapperImpl.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/MetricsRegionWrapperImpl.java @@ -49,6 +49,7 @@ public class MetricsRegionWrapperImpl implements MetricsRegionWrapper, Closeable private Runnable runnable; private long numStoreFiles; private long storeRefCount; + private long maxStoreFileRefCount; private long memstoreSize; private long storeFileSize; private long maxStoreFileAge; @@ -125,6 +126,11 @@ public long getStoreRefCount() { return storeRefCount; } + @Override + public long getMaxStoreFileRefCount() { + return maxStoreFileRefCount; + } + @Override public long getReadRequestCount() { return this.region.getReadRequestsCount(); @@ -233,6 +239,7 @@ public class HRegionMetricsWrapperRunnable implements Runnable { public void run() { long tempNumStoreFiles = 0; int tempStoreRefCount = 0; + int tempMaxStoreFileRefCount = 0; long tempMemstoreSize = 0; long tempStoreFileSize = 0; long tempMaxStoreFileAge = 0; @@ -240,13 +247,16 @@ public void run() { long tempNumReferenceFiles = 0; long tempMaxCompactionQueueSize = 0; long tempMaxFlushQueueSize = 0; - long avgAgeNumerator = 0; long numHFiles = 0; if (region.stores != null) { for (HStore store : region.stores.values()) { tempNumStoreFiles += store.getStorefilesCount(); - tempStoreRefCount += store.getStoreRefCount(); + int currentStoreRefCount = store.getStoreRefCount(); + tempStoreRefCount += currentStoreRefCount; + int currentMaxStoreFileRefCount = store.getMaxStoreFileRefCount(); + tempMaxStoreFileRefCount = Math.max(tempMaxStoreFileRefCount, + currentMaxStoreFileRefCount); tempMemstoreSize += store.getMemStoreSize().getDataSize(); tempStoreFileSize += store.getStorefilesSize(); OptionalLong storeMaxStoreFileAge = store.getMaxStoreFileAge(); @@ -274,6 +284,7 @@ public void run() { numStoreFiles = tempNumStoreFiles; storeRefCount = tempStoreRefCount; + maxStoreFileRefCount = tempMaxStoreFileRefCount; memstoreSize = tempMemstoreSize; storeFileSize = tempStoreFileSize; maxStoreFileAge = tempMaxStoreFileAge; diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestRegionsRecoveryChore.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestRegionsRecoveryChore.java new file mode 100644 index 000000000000..524d445a68c5 --- /dev/null +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestRegionsRecoveryChore.java @@ -0,0 +1,613 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hbase.master; + +import edu.umd.cs.findbugs.annotations.Nullable; +import java.io.UnsupportedEncodingException; +import java.nio.charset.StandardCharsets; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hbase.ClusterMetrics; +import org.apache.hadoop.hbase.HBaseClassTestRule; +import org.apache.hadoop.hbase.HBaseTestingUtility; +import org.apache.hadoop.hbase.RegionMetrics; +import org.apache.hadoop.hbase.ServerMetrics; +import org.apache.hadoop.hbase.ServerName; +import org.apache.hadoop.hbase.Size; +import org.apache.hadoop.hbase.Stoppable; +import org.apache.hadoop.hbase.TableName; +import org.apache.hadoop.hbase.client.RegionInfo; +import org.apache.hadoop.hbase.client.RegionStatesCount; +import org.apache.hadoop.hbase.master.assignment.AssignmentManager; +import org.apache.hadoop.hbase.replication.ReplicationLoadSink; +import org.apache.hadoop.hbase.replication.ReplicationLoadSource; +import org.apache.hadoop.hbase.testclassification.MasterTests; +import org.apache.hadoop.hbase.testclassification.SmallTests; +import org.apache.hadoop.hbase.util.Bytes; +import org.junit.After; +import org.junit.Before; +import org.junit.ClassRule; +import org.junit.Test; +import org.junit.experimental.categories.Category; +import org.mockito.Mockito; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Test for RegionsRecoveryChore + */ +@Category({MasterTests.class, SmallTests.class}) +public class TestRegionsRecoveryChore { + + @ClassRule + public static final HBaseClassTestRule CLASS_RULE = + HBaseClassTestRule.forClass(TestRegionsRecoveryChore.class); + + private static final Logger LOG = LoggerFactory.getLogger(TestRegionsRecoveryChore.class); + + private static final HBaseTestingUtility HBASE_TESTING_UTILITY = new HBaseTestingUtility(); + + private static final String UTF_8_CHARSET = StandardCharsets.UTF_8.name(); + + private HMaster hMaster; + + private AssignmentManager assignmentManager; + + private RegionsRecoveryChore regionsRecoveryChore; + + private static int regionNo; + public static final byte[][] REGION_NAME_LIST = new byte[][]{ + new byte[]{114, 101, 103, 105, 111, 110, 50, 49, 95, 51}, + new byte[]{114, 101, 103, 105, 111, 110, 50, 53, 95, 51}, + new byte[]{114, 101, 103, 105, 111, 110, 50, 54, 95, 52}, + new byte[]{114, 101, 103, 105, 111, 110, 51, 50, 95, 53}, + new byte[]{114, 101, 103, 105, 111, 110, 51, 49, 95, 52}, + new byte[]{114, 101, 103, 105, 111, 110, 51, 48, 95, 51}, + new byte[]{114, 101, 103, 105, 111, 110, 50, 48, 95, 50}, + new byte[]{114, 101, 103, 105, 111, 110, 50, 52, 95, 50}, + new byte[]{114, 101, 103, 105, 111, 110, 50, 57, 95, 50}, + new byte[]{114, 101, 103, 105, 111, 110, 51, 53, 95, 50}, + new byte[]{114, 101, 103, 105, 111, 110, 49, 48, 56, 95, 49, 49} + }; + + private Configuration getCustomConf() { + Configuration conf = HBASE_TESTING_UTILITY.getConfiguration(); + conf.setInt("hbase.master.regions.recovery.check.interval", 100); + return conf; + } + + @Before + public void setUp() throws Exception { + this.hMaster = Mockito.mock(HMaster.class); + this.assignmentManager = Mockito.mock(AssignmentManager.class); + } + + @After + public void tearDown() throws Exception { + Mockito.verifyNoMoreInteractions(this.hMaster); + Mockito.verifyNoMoreInteractions(this.assignmentManager); + } + + @Test + public void testRegionReopensWithStoreRefConfig() throws Exception { + regionNo = 0; + ClusterMetrics clusterMetrics = TestRegionsRecoveryChore.getClusterMetrics(4); + final Map serverMetricsMap = + clusterMetrics.getLiveServerMetrics(); + LOG.debug("All Region Names with refCount...."); + for (ServerMetrics serverMetrics : serverMetricsMap.values()) { + Map regionMetricsMap = serverMetrics.getRegionMetrics(); + for (RegionMetrics regionMetrics : regionMetricsMap.values()) { + LOG.debug("name: " + new String(regionMetrics.getRegionName()) + " refCount: " + + regionMetrics.getStoreRefCount()); + } + } + Mockito.when(hMaster.getClusterMetrics()).thenReturn(clusterMetrics); + Mockito.when(hMaster.getAssignmentManager()).thenReturn(assignmentManager); + for (byte[] regionName : REGION_NAME_LIST) { + Mockito.when(assignmentManager.getRegionInfo(regionName)) + .thenReturn(TestRegionsRecoveryChore.getRegionInfo(regionName)); + } + Stoppable stoppable = new StoppableImplementation(); + Configuration configuration = getCustomConf(); + configuration.setInt("hbase.regions.recovery.store.file.ref.count", 300); + regionsRecoveryChore = new RegionsRecoveryChore(stoppable, configuration, hMaster); + regionsRecoveryChore.chore(); + + // Verify that we need to reopen regions of 2 tables + Mockito.verify(hMaster, Mockito.times(2)).reopenRegions(Mockito.any(), Mockito.anyList(), + Mockito.anyLong(), Mockito.anyLong()); + Mockito.verify(hMaster, Mockito.times(1)).getClusterMetrics(); + + // Verify that we need to reopen total 3 regions that have refCount > 300 + Mockito.verify(hMaster, Mockito.times(3)).getAssignmentManager(); + Mockito.verify(assignmentManager, Mockito.times(3)) + .getRegionInfo(Mockito.any()); + } + + @Test + public void testRegionReopensWithLessThreshold() throws Exception { + regionNo = 0; + ClusterMetrics clusterMetrics = TestRegionsRecoveryChore.getClusterMetrics(4); + final Map serverMetricsMap = + clusterMetrics.getLiveServerMetrics(); + LOG.debug("All Region Names with refCount...."); + for (ServerMetrics serverMetrics : serverMetricsMap.values()) { + Map regionMetricsMap = serverMetrics.getRegionMetrics(); + for (RegionMetrics regionMetrics : regionMetricsMap.values()) { + LOG.debug("name: " + new String(regionMetrics.getRegionName()) + " refCount: " + + regionMetrics.getStoreRefCount()); + } + } + Mockito.when(hMaster.getClusterMetrics()).thenReturn(clusterMetrics); + Mockito.when(hMaster.getAssignmentManager()).thenReturn(assignmentManager); + for (byte[] regionName : REGION_NAME_LIST) { + Mockito.when(assignmentManager.getRegionInfo(regionName)) + .thenReturn(TestRegionsRecoveryChore.getRegionInfo(regionName)); + } + Stoppable stoppable = new StoppableImplementation(); + Configuration configuration = getCustomConf(); + configuration.setInt("hbase.regions.recovery.store.file.ref.count", 400); + regionsRecoveryChore = new RegionsRecoveryChore(stoppable, configuration, hMaster); + regionsRecoveryChore.chore(); + + // Verify that we need to reopen regions of only 1 table + Mockito.verify(hMaster, Mockito.times(1)).reopenRegions(Mockito.any(), Mockito.anyList(), + Mockito.anyLong(), Mockito.anyLong()); + Mockito.verify(hMaster, Mockito.times(1)).getClusterMetrics(); + + // Verify that we need to reopen only 1 region with refCount > 400 + Mockito.verify(hMaster, Mockito.times(1)).getAssignmentManager(); + Mockito.verify(assignmentManager, Mockito.times(1)) + .getRegionInfo(Mockito.any()); + } + + @Test + public void testRegionReopensWithoutStoreRefConfig() throws Exception { + regionNo = 0; + ClusterMetrics clusterMetrics = TestRegionsRecoveryChore.getClusterMetrics(10); + final Map serverMetricsMap = + clusterMetrics.getLiveServerMetrics(); + LOG.debug("All Region Names with refCount...."); + for (ServerMetrics serverMetrics : serverMetricsMap.values()) { + Map regionMetricsMap = serverMetrics.getRegionMetrics(); + for (RegionMetrics regionMetrics : regionMetricsMap.values()) { + LOG.debug("name: " + new String(regionMetrics.getRegionName()) + " refCount: " + + regionMetrics.getStoreRefCount()); + } + } + Mockito.when(hMaster.getClusterMetrics()).thenReturn(clusterMetrics); + Mockito.when(hMaster.getAssignmentManager()).thenReturn(assignmentManager); + for (byte[] regionName : REGION_NAME_LIST) { + Mockito.when(assignmentManager.getRegionInfo(regionName)) + .thenReturn(TestRegionsRecoveryChore.getRegionInfo(regionName)); + } + Stoppable stoppable = new StoppableImplementation(); + Configuration configuration = getCustomConf(); + configuration.unset("hbase.regions.recovery.store.file.ref.count"); + regionsRecoveryChore = new RegionsRecoveryChore(stoppable, configuration, hMaster); + regionsRecoveryChore.chore(); + + // Verify that by default the feature is turned off so no regions + // should be reopened + Mockito.verify(hMaster, Mockito.times(0)).reopenRegions(Mockito.any(), Mockito.anyList(), + Mockito.anyLong(), Mockito.anyLong()); + + // default maxStoreFileRefCount is -1 (no regions to be reopened using AM) + Mockito.verify(hMaster, Mockito.times(0)).getAssignmentManager(); + Mockito.verify(assignmentManager, Mockito.times(0)) + .getRegionInfo(Mockito.any()); + } + + private static ClusterMetrics getClusterMetrics(int noOfLiveServer) { + ClusterMetrics clusterMetrics = new ClusterMetrics() { + + @Nullable + @Override + public String getHBaseVersion() { + return null; + } + + @Override + public List getDeadServerNames() { + return null; + } + + @Override + public Map getLiveServerMetrics() { + Map liveServerMetrics = new HashMap<>(); + for (int i = 0; i < noOfLiveServer; i++) { + ServerName serverName = ServerName.valueOf("rs_" + i, 16010, 12345); + liveServerMetrics.put(serverName, TestRegionsRecoveryChore.getServerMetrics(i + 3)); + } + return liveServerMetrics; + } + + @Nullable + @Override + public ServerName getMasterName() { + return null; + } + + @Override + public List getBackupMasterNames() { + return null; + } + + @Override + public List getRegionStatesInTransition() { + return null; + } + + @Nullable + @Override + public String getClusterId() { + return null; + } + + @Override + public List getMasterCoprocessorNames() { + return null; + } + + @Nullable + @Override + public Boolean getBalancerOn() { + return null; + } + + @Override + public int getMasterInfoPort() { + return 0; + } + + @Override + public List getServersName() { + return null; + } + + @Override + public Map getTableRegionStatesCount() { + return null; + } + + }; + return clusterMetrics; + } + + private static ServerMetrics getServerMetrics(int noOfRegions) { + ServerMetrics serverMetrics = new ServerMetrics() { + + @Override + public ServerName getServerName() { + return null; + } + + @Override + public long getRequestCountPerSecond() { + return 0; + } + + @Override + public long getRequestCount() { + return 0; + } + + @Override + public Size getUsedHeapSize() { + return null; + } + + @Override + public Size getMaxHeapSize() { + return null; + } + + @Override + public int getInfoServerPort() { + return 0; + } + + @Override + public List getReplicationLoadSourceList() { + return null; + } + + @Override + public Map> getReplicationLoadSourceMap() { + return null; + } + + @Nullable + @Override + public ReplicationLoadSink getReplicationLoadSink() { + return null; + } + + @Override + public Map getRegionMetrics() { + Map regionMetricsMap = new HashMap<>(); + for (int i = 0; i < noOfRegions; i++) { + byte[] regionName = Bytes.toBytes("region" + regionNo + "_" + i); + regionMetricsMap.put(regionName, + TestRegionsRecoveryChore.getRegionMetrics(regionName, 100 * i)); + ++regionNo; + } + return regionMetricsMap; + } + + @Override + public Set getCoprocessorNames() { + return null; + } + + @Override + public long getReportTimestamp() { + return 0; + } + + @Override + public long getLastReportTimestamp() { + return 0; + } + + }; + return serverMetrics; + } + + private static RegionMetrics getRegionMetrics(byte[] regionName, int storeRefCount) { + RegionMetrics regionMetrics = new RegionMetrics() { + + @Override + public byte[] getRegionName() { + return regionName; + } + + @Override + public int getStoreCount() { + return 0; + } + + @Override + public int getStoreFileCount() { + return 0; + } + + @Override + public Size getStoreFileSize() { + return null; + } + + @Override + public Size getMemStoreSize() { + return null; + } + + @Override + public long getReadRequestCount() { + return 0; + } + + @Override + public long getWriteRequestCount() { + return 0; + } + + @Override + public long getCpRequestCount() { + return 0; + } + + @Override + public long getFilteredReadRequestCount() { + return 0; + } + + @Override + public Size getStoreFileIndexSize() { + return null; + } + + @Override + public Size getStoreFileRootLevelIndexSize() { + return null; + } + + @Override + public Size getStoreFileUncompressedDataIndexSize() { + return null; + } + + @Override + public Size getBloomFilterSize() { + return null; + } + + @Override + public long getCompactingCellCount() { + return 0; + } + + @Override + public long getCompactedCellCount() { + return 0; + } + + @Override + public long getCompletedSequenceId() { + return 0; + } + + @Override + public Map getStoreSequenceId() { + return null; + } + + @Override + public Size getUncompressedStoreFileSize() { + return null; + } + + @Override + public float getDataLocality() { + return 0; + } + + @Override + public long getLastMajorCompactionTimestamp() { + return 0; + } + + @Override + public int getStoreRefCount() { + return storeRefCount; + } + + @Override + public int getMaxStoreFileRefCount() { + return storeRefCount; + } + + }; + return regionMetrics; + } + + private static RegionInfo getRegionInfo(byte[] regionNameBytes) { + RegionInfo regionInfo = new RegionInfo() { + + @Override + public String getShortNameToLog() { + return null; + } + + @Override + public long getRegionId() { + return 0; + } + + @Override + public byte[] getRegionName() { + return new byte[0]; + } + + @Override + public String getRegionNameAsString() { + try { + return new String(regionNameBytes, UTF_8_CHARSET); + } catch (UnsupportedEncodingException e) { + return ""; + } + } + + @Override + public String getEncodedName() { + return null; + } + + @Override + public byte[] getEncodedNameAsBytes() { + return new byte[0]; + } + + @Override + public byte[] getStartKey() { + return new byte[0]; + } + + @Override + public byte[] getEndKey() { + return new byte[0]; + } + + @Override + public TableName getTable() { + String regionName; + try { + regionName = new String(regionNameBytes, UTF_8_CHARSET); + } catch (UnsupportedEncodingException e) { + regionName = ""; + } + int regionNo = Integer.parseInt(regionName.split("_")[1]); + TableName tableName = TableName.valueOf("table_" + regionNo % 3); + return tableName; + } + + @Override + public int getReplicaId() { + return 0; + } + + @Override + public boolean isSplit() { + return false; + } + + @Override + public boolean isOffline() { + return false; + } + + @Override + public boolean isSplitParent() { + return false; + } + + @Override + public boolean isMetaRegion() { + return false; + } + + @Override + public boolean containsRange(byte[] rangeStartKey, byte[] rangeEndKey) { + return false; + } + + @Override + public boolean containsRow(byte[] row) { + return false; + } + + }; + return regionInfo; + } + + /** + * Simple helper class that just keeps track of whether or not its stopped. + */ + private static class StoppableImplementation implements Stoppable { + + private volatile boolean stop = false; + + @Override + public void stop(String why) { + this.stop = true; + } + + @Override + public boolean isStopped() { + return this.stop; + } + + } + +} \ No newline at end of file diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/MetricsRegionWrapperStub.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/MetricsRegionWrapperStub.java index acdb7321ee9a..1e5cb3ed2e95 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/MetricsRegionWrapperStub.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/MetricsRegionWrapperStub.java @@ -65,6 +65,11 @@ public long getStoreRefCount() { return 0; } + @Override + public long getMaxStoreFileRefCount() { + return 0; + } + @Override public long getMemStoreSize() { return 103; diff --git a/src/main/asciidoc/_chapters/hbase-default.adoc b/src/main/asciidoc/_chapters/hbase-default.adoc index e7d3bce5e096..59a3a073c362 100644 --- a/src/main/asciidoc/_chapters/hbase-default.adoc +++ b/src/main/asciidoc/_chapters/hbase-default.adoc @@ -2161,3 +2161,43 @@ The percent of region server RPC threads failed to abort RS. + .Default `0` + + +[[hbase.master.regions.recovery.check.interval]] +*`hbase.master.regions.recovery.check.interval`*:: ++ +.Description + + Regions Recovery Chore interval in milliseconds. + This chore keeps running at this interval to + find all regions with configurable max store file ref count + and reopens them. + ++ +.Default +`1200000` + + +[[hbase.regions.recovery.store.file.ref.count]] +*`hbase.regions.recovery.store.file.ref.count`*:: ++ +.Description + + Very large ref count on a file indicates + that it is a ref leak on that object. Such files + can not be removed even after it is invalidated + via compaction. Only way to recover in such + scenario is to reopen the region which can + release all resources, like the refcount, leases, etc. + This config represents Store files Ref Count threshold + value considered for reopening regions. + Any region with store files ref count > this value + would be eligible for reopening by master. + Default value -1 indicates this feature is turned off. + Only positive integer value should be provided to enable + this feature. + ++ +.Default +`-1` + diff --git a/src/main/asciidoc/_chapters/ops_mgt.adoc b/src/main/asciidoc/_chapters/ops_mgt.adoc index 4b656f4a723c..616b02056c9f 100644 --- a/src/main/asciidoc/_chapters/ops_mgt.adoc +++ b/src/main/asciidoc/_chapters/ops_mgt.adoc @@ -3618,3 +3618,26 @@ have auto normalization turned on …..…..…. server=hbase-test-rc-5.openstacklocal,16020,1469419333913} 2016-07-26 07:39:46,246 DEBUG [AM.ZK.Worker-pool2-t278] master.RegionStates: Onlined d6b5625df331cfec84dce4f1122c567f on hbase-test-rc-5.openstacklocal,16020,1469419333913 {ENCODED => d6b5625df331cfec84dce4f1122c567f, NAME => 'table_h2osxu3wat,154717,1469518785661.d6b5625df331cfec84dce4f1122c567f.', STARTKEY => '154717', ENDKEY => '3'} ---- + + + +[[auto_reopen_regions]] +== Auto Region Reopen + +We can leak store reader references if a coprocessor or core function somehow +opens a scanner, or wraps one, and then does not take care to call close on the +scanner or the wrapped instance. Leaked store files can not be removed even +after it is invalidated via compaction. +A reasonable mitigation for a reader reference +leak would be a fast reopen of the region on the same server. +This will release all resources, like the refcount, leases, etc. +The clients should gracefully ride over this like any other region in +transition. +By default this auto reopen of region feature would be disabled. +To enabled it, please provide high ref count value for config +`hbase.regions.recovery.store.file.ref.count`. + +Please refer to config descriptions for +`hbase.master.regions.recovery.check.interval` and +`hbase.regions.recovery.store.file.ref.count`. +