Skip to content

Commit

Permalink
HBASE-22460 : Reopen regions with very high Store Ref Counts (#600)
Browse files Browse the repository at this point in the history
Signed-off-by Anoop Sam John <[email protected]>
  • Loading branch information
virajjasani authored and anoopsjohn committed Oct 23, 2019
1 parent 2ad62b0 commit 14dcf1d
Show file tree
Hide file tree
Showing 21 changed files with 1,077 additions and 4 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -154,4 +154,10 @@ default String getNameAsString() {
* @return the reference count for the stores of this region
*/
int getStoreRefCount();

/**
* @return the max reference count for any store file among all stores files
* of this region
*/
int getMaxStoreFileRefCount();
}
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ public static RegionMetrics toRegionMetrics(ClusterStatusProtos.RegionLoad regio
.setStoreCount(regionLoadPB.getStores())
.setStoreFileCount(regionLoadPB.getStorefiles())
.setStoreRefCount(regionLoadPB.getStoreRefCount())
.setMaxStoreFileRefCount(regionLoadPB.getMaxStoreFileRefCount())
.setStoreFileSize(new Size(regionLoadPB.getStorefileSizeMB(), Size.Unit.MEGABYTE))
.setStoreSequenceIds(regionLoadPB.getStoreCompleteSequenceIdList().stream()
.collect(Collectors.toMap(
Expand Down Expand Up @@ -113,6 +114,7 @@ public static ClusterStatusProtos.RegionLoad toRegionLoad(RegionMetrics regionMe
.setStores(regionMetrics.getStoreCount())
.setStorefiles(regionMetrics.getStoreFileCount())
.setStoreRefCount(regionMetrics.getStoreRefCount())
.setMaxStoreFileRefCount(regionMetrics.getMaxStoreFileRefCount())
.setStorefileSizeMB((int) regionMetrics.getStoreFileSize().get(Size.Unit.MEGABYTE))
.addAllStoreCompleteSequenceId(toStoreSequenceId(regionMetrics.getStoreSequenceId()))
.setStoreUncompressedSizeMB(
Expand All @@ -128,6 +130,7 @@ public static RegionMetricsBuilder newBuilder(byte[] name) {
private int storeCount;
private int storeFileCount;
private int storeRefCount;
private int maxStoreFileRefCount;
private long compactingCellCount;
private long compactedCellCount;
private Size storeFileSize = Size.ZERO;
Expand Down Expand Up @@ -161,6 +164,10 @@ public RegionMetricsBuilder setStoreRefCount(int value) {
this.storeRefCount = value;
return this;
}
public RegionMetricsBuilder setMaxStoreFileRefCount(int value) {
this.maxStoreFileRefCount = value;
return this;
}
public RegionMetricsBuilder setCompactingCellCount(long value) {
this.compactingCellCount = value;
return this;
Expand Down Expand Up @@ -235,6 +242,7 @@ public RegionMetrics build() {
storeCount,
storeFileCount,
storeRefCount,
maxStoreFileRefCount,
compactingCellCount,
compactedCellCount,
storeFileSize,
Expand All @@ -259,6 +267,7 @@ private static class RegionMetricsImpl implements RegionMetrics {
private final int storeCount;
private final int storeFileCount;
private final int storeRefCount;
private final int maxStoreFileRefCount;
private final long compactingCellCount;
private final long compactedCellCount;
private final Size storeFileSize;
Expand All @@ -280,6 +289,7 @@ private static class RegionMetricsImpl implements RegionMetrics {
int storeCount,
int storeFileCount,
int storeRefCount,
int maxStoreFileRefCount,
final long compactingCellCount,
long compactedCellCount,
Size storeFileSize,
Expand All @@ -301,6 +311,7 @@ private static class RegionMetricsImpl implements RegionMetrics {
this.storeCount = storeCount;
this.storeFileCount = storeFileCount;
this.storeRefCount = storeRefCount;
this.maxStoreFileRefCount = maxStoreFileRefCount;
this.compactingCellCount = compactingCellCount;
this.compactedCellCount = compactedCellCount;
this.storeFileSize = Preconditions.checkNotNull(storeFileSize);
Expand Down Expand Up @@ -340,6 +351,11 @@ public int getStoreRefCount() {
return storeRefCount;
}

@Override
public int getMaxStoreFileRefCount() {
return maxStoreFileRefCount;
}

@Override
public Size getStoreFileSize() {
return storeFileSize;
Expand Down Expand Up @@ -433,6 +449,8 @@ public String toString() {
this.getStoreFileCount());
Strings.appendKeyValue(sb, "storeRefCount",
this.getStoreRefCount());
Strings.appendKeyValue(sb, "maxStoreFileRefCount",
this.getMaxStoreFileRefCount());
Strings.appendKeyValue(sb, "uncompressedStoreFileSize",
this.getUncompressedStoreFileSize());
Strings.appendKeyValue(sb, "lastMajorCompactionTimestamp",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -343,6 +343,8 @@ public long getLastReportTimestamp() {
public String toString() {
int storeCount = 0;
int storeFileCount = 0;
int storeRefCount = 0;
int maxStoreFileRefCount = 0;
long uncompressedStoreFileSizeMB = 0;
long storeFileSizeMB = 0;
long memStoreSizeMB = 0;
Expand All @@ -358,6 +360,9 @@ public String toString() {
for (RegionMetrics r : getRegionMetrics().values()) {
storeCount += r.getStoreCount();
storeFileCount += r.getStoreFileCount();
storeRefCount += r.getStoreRefCount();
int currentMaxStoreFileRefCount = r.getMaxStoreFileRefCount();
maxStoreFileRefCount = Math.max(maxStoreFileRefCount, currentMaxStoreFileRefCount);
uncompressedStoreFileSizeMB += r.getUncompressedStoreFileSize().get(Size.Unit.MEGABYTE);
storeFileSizeMB += r.getStoreFileSize().get(Size.Unit.MEGABYTE);
memStoreSizeMB += r.getMemStoreSize().get(Size.Unit.MEGABYTE);
Expand All @@ -379,6 +384,8 @@ public String toString() {
Strings.appendKeyValue(sb, "maxHeapMB", getMaxHeapSize());
Strings.appendKeyValue(sb, "numberOfStores", storeCount);
Strings.appendKeyValue(sb, "numberOfStorefiles", storeFileCount);
Strings.appendKeyValue(sb, "storeRefCount", storeRefCount);
Strings.appendKeyValue(sb, "maxStoreFileRefCount", maxStoreFileRefCount);
Strings.appendKeyValue(sb, "storefileUncompressedSizeMB", uncompressedStoreFileSizeMB);
Strings.appendKeyValue(sb, "storefileSizeMB", storeFileSizeMB);
if (uncompressedStoreFileSizeMB != 0) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1474,6 +1474,13 @@ public enum OperationStatusCode {
// User defined Default TTL config key
public static final String DEFAULT_SNAPSHOT_TTL_CONFIG_KEY = "hbase.master.snapshot.ttl";

// Regions Recovery based on high storeFileRefCount threshold value
public static final String STORE_FILE_REF_COUNT_THRESHOLD =
"hbase.regions.recovery.store.file.ref.count";

// default -1 indicates there is no threshold on high storeRefCount
public static final int DEFAULT_STORE_FILE_REF_COUNT_THRESHOLD = -1;

/**
* Configurations for master executor services.
*/
Expand Down
29 changes: 29 additions & 0 deletions hbase-common/src/main/resources/hbase-default.xml
Original file line number Diff line number Diff line change
Expand Up @@ -1901,4 +1901,33 @@ possible configurations would overwhelm and obscure the important.
automatically deleted until it is manually deleted
</description>
</property>
<property>
<name>hbase.master.regions.recovery.check.interval</name>
<value>1200000</value>
<description>
Regions Recovery Chore interval in milliseconds.
This chore keeps running at this interval to
find all regions with configurable max store file ref count
and reopens them.
</description>
</property>
<property>
<name>hbase.regions.recovery.store.file.ref.count</name>
<value>-1</value>
<description>
Very large ref count on a file indicates
that it is a ref leak on that object. Such files
can not be removed even after it is invalidated
via compaction. Only way to recover in such
scenario is to reopen the region which can
release all resources, like the refcount, leases, etc.
This config represents Store files Ref Count threshold
value considered for reopening regions.
Any region with store files ref count > this value
would be eligible for reopening by master.
Default value -1 indicates this feature is turned off.
Only positive integer value should be provided to enable
this feature.
</description>
</property>
</configuration>
Original file line number Diff line number Diff line change
Expand Up @@ -233,6 +233,7 @@ public interface MetricsRegionServerSource extends BaseSource, JvmPauseMonitorSo
String STOREFILE_COUNT_DESC = "Number of Store Files";
String STORE_REF_COUNT = "storeRefCount";
String STORE_REF_COUNT_DESC = "Store reference count";
String MAX_STORE_FILE_REF_COUNT = "maxStoreFileRefCount";
String MEMSTORE_SIZE = "memStoreSize";
String MEMSTORE_SIZE_DESC = "Size of the memstore";
String STOREFILE_SIZE = "storeFileSize";
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -164,4 +164,10 @@ public interface MetricsRegionWrapper {
* @return the number of references active on the store
*/
long getStoreRefCount();

/**
* @return the max number of references active on any store file among
* all store files that belong to this region
*/
long getMaxStoreFileRefCount();
}
Original file line number Diff line number Diff line change
Expand Up @@ -217,6 +217,10 @@ void snapshot(MetricsRecordBuilder mrb, boolean ignored) {
regionNamePrefix + MetricsRegionServerSource.STORE_REF_COUNT,
MetricsRegionServerSource.STORE_REF_COUNT),
this.regionWrapper.getStoreRefCount());
mrb.addGauge(Interns.info(
regionNamePrefix + MetricsRegionServerSource.MAX_STORE_FILE_REF_COUNT,
MetricsRegionServerSource.MAX_STORE_FILE_REF_COUNT),
this.regionWrapper.getMaxStoreFileRefCount());
mrb.addGauge(Interns.info(
regionNamePrefix + MetricsRegionServerSource.MEMSTORE_SIZE,
MetricsRegionServerSource.MEMSTORE_SIZE_DESC),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,11 @@ public long getStoreRefCount() {
return 0;
}

@Override
public long getMaxStoreFileRefCount() {
return 0;
}

@Override
public long getMemStoreSize() {
return 0;
Expand Down
6 changes: 6 additions & 0 deletions hbase-protocol-shaded/src/main/protobuf/ClusterStatus.proto
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,12 @@ message RegionLoad {

/** the number of references active on the store */
optional int32 store_ref_count = 21 [default = 0];

/**
* The max number of references active on single store file among all store files
* that belong to given region
*/
optional int32 max_store_file_ref_count = 22 [default = 0];
}

/* Server-level protobufs */
Expand Down
6 changes: 6 additions & 0 deletions hbase-protocol/src/main/protobuf/ClusterStatus.proto
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,12 @@ message RegionLoad {

/** the number of references active on the store */
optional int32 store_ref_count = 21 [default = 0];

/**
* The max number of references active on single store file among all store files
* that belong to given region
*/
optional int32 max_store_file_ref_count = 22 [default = 0];
}

/* Server-level protobufs */
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,7 @@
import org.apache.hadoop.hbase.master.procedure.ProcedurePrepareLatch;
import org.apache.hadoop.hbase.master.procedure.ProcedureSyncWait;
import org.apache.hadoop.hbase.master.procedure.RecoverMetaProcedure;
import org.apache.hadoop.hbase.master.procedure.ReopenTableRegionsProcedure;
import org.apache.hadoop.hbase.master.procedure.ServerCrashProcedure;
import org.apache.hadoop.hbase.master.procedure.TruncateTableProcedure;
import org.apache.hadoop.hbase.master.replication.AbstractPeerProcedure;
Expand Down Expand Up @@ -421,6 +422,7 @@ public void run() {
// monitor for distributed procedures
private MasterProcedureManagerHost mpmHost;

private RegionsRecoveryChore regionsRecoveryChore = null;
// it is assigned after 'initialized' guard set to true, so should be volatile
private volatile MasterQuotaManager quotaManager;
private SpaceQuotaSnapshotNotifier spaceQuotaSnapshotNotifier;
Expand Down Expand Up @@ -1464,6 +1466,20 @@ private void startServiceThreads() throws IOException {
getMasterFileSystem().getFileSystem(), archiveDir, cleanerPool, params);
getChoreService().scheduleChore(hfileCleaner);

// Regions Reopen based on very high storeFileRefCount is considered enabled
// only if hbase.regions.recovery.store.file.ref.count has value > 0
final int maxStoreFileRefCount = conf.getInt(
HConstants.STORE_FILE_REF_COUNT_THRESHOLD,
HConstants.DEFAULT_STORE_FILE_REF_COUNT_THRESHOLD);
if (maxStoreFileRefCount > 0) {
this.regionsRecoveryChore = new RegionsRecoveryChore(this, conf, this);
getChoreService().scheduleChore(this.regionsRecoveryChore);
} else {
LOG.info("Reopening regions with very high storeFileRefCount is disabled. " +
"Provide threshold value > 0 for {} to enable it.",
HConstants.STORE_FILE_REF_COUNT_THRESHOLD);
}

replicationBarrierCleaner = new ReplicationBarrierCleaner(conf, this, getConnection(),
replicationPeerManager);
getChoreService().scheduleChore(replicationBarrierCleaner);
Expand Down Expand Up @@ -1631,6 +1647,7 @@ private void stopChores() {
choreService.cancelChore(this.replicationBarrierCleaner);
choreService.cancelChore(this.snapshotCleanerChore);
choreService.cancelChore(this.hbckChore);
choreService.cancelChore(this.regionsRecoveryChore);
}
}

Expand Down Expand Up @@ -3732,6 +3749,38 @@ public void remoteProcedureFailed(long procId, RemoteProcedureException error) {
}
}

/**
* Reopen regions provided in the argument
*
* @param tableName The current table name
* @param regionNames The region names of the regions to reopen
* @param nonceGroup Identifier for the source of the request, a client or process
* @param nonce A unique identifier for this operation from the client or process identified by
* <code>nonceGroup</code> (the source must ensure each operation gets a unique id).
* @return procedure Id
* @throws IOException if reopening region fails while running procedure
*/
long reopenRegions(final TableName tableName, final List<byte[]> regionNames,
final long nonceGroup, final long nonce)
throws IOException {

return MasterProcedureUtil
.submitProcedure(new MasterProcedureUtil.NonceProcedureRunnable(this, nonceGroup, nonce) {

@Override
protected void run() throws IOException {
submitProcedure(new ReopenTableRegionsProcedure(tableName, regionNames));
}

@Override
protected String getDescription() {
return "ReopenTableRegionsProcedure";
}

});

}

@Override
public ReplicationPeerManager getReplicationPeerManager() {
return replicationPeerManager;
Expand Down
Loading

0 comments on commit 14dcf1d

Please sign in to comment.