Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

HBASE-22460 : Reopen regions with very high Store Ref Counts #600

Merged
merged 2 commits into from
Oct 23, 2019
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -154,4 +154,10 @@ default String getNameAsString() {
* @return the reference count for the stores of this region
*/
int getStoreRefCount();

/**
* @return the max reference count for any store file among all stores files
* of this region
*/
int getMaxStoreFileRefCount();
}
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ public static RegionMetrics toRegionMetrics(ClusterStatusProtos.RegionLoad regio
.setStoreCount(regionLoadPB.getStores())
.setStoreFileCount(regionLoadPB.getStorefiles())
.setStoreRefCount(regionLoadPB.getStoreRefCount())
.setMaxStoreFileRefCount(regionLoadPB.getMaxStoreFileRefCount())
.setStoreFileSize(new Size(regionLoadPB.getStorefileSizeMB(), Size.Unit.MEGABYTE))
.setStoreSequenceIds(regionLoadPB.getStoreCompleteSequenceIdList().stream()
.collect(Collectors.toMap(
Expand Down Expand Up @@ -113,6 +114,7 @@ public static ClusterStatusProtos.RegionLoad toRegionLoad(RegionMetrics regionMe
.setStores(regionMetrics.getStoreCount())
.setStorefiles(regionMetrics.getStoreFileCount())
.setStoreRefCount(regionMetrics.getStoreRefCount())
.setMaxStoreFileRefCount(regionMetrics.getMaxStoreFileRefCount())
.setStorefileSizeMB((int) regionMetrics.getStoreFileSize().get(Size.Unit.MEGABYTE))
.addAllStoreCompleteSequenceId(toStoreSequenceId(regionMetrics.getStoreSequenceId()))
.setStoreUncompressedSizeMB(
Expand All @@ -128,6 +130,7 @@ public static RegionMetricsBuilder newBuilder(byte[] name) {
private int storeCount;
private int storeFileCount;
private int storeRefCount;
private int maxStoreFileRefCount;
private long compactingCellCount;
private long compactedCellCount;
private Size storeFileSize = Size.ZERO;
Expand Down Expand Up @@ -161,6 +164,10 @@ public RegionMetricsBuilder setStoreRefCount(int value) {
this.storeRefCount = value;
return this;
}
public RegionMetricsBuilder setMaxStoreFileRefCount(int value) {
this.maxStoreFileRefCount = value;
return this;
}
public RegionMetricsBuilder setCompactingCellCount(long value) {
this.compactingCellCount = value;
return this;
Expand Down Expand Up @@ -235,6 +242,7 @@ public RegionMetrics build() {
storeCount,
storeFileCount,
storeRefCount,
maxStoreFileRefCount,
compactingCellCount,
compactedCellCount,
storeFileSize,
Expand All @@ -259,6 +267,7 @@ private static class RegionMetricsImpl implements RegionMetrics {
private final int storeCount;
private final int storeFileCount;
private final int storeRefCount;
private final int maxStoreFileRefCount;
private final long compactingCellCount;
private final long compactedCellCount;
private final Size storeFileSize;
Expand All @@ -280,6 +289,7 @@ private static class RegionMetricsImpl implements RegionMetrics {
int storeCount,
int storeFileCount,
int storeRefCount,
int maxStoreFileRefCount,
final long compactingCellCount,
long compactedCellCount,
Size storeFileSize,
Expand All @@ -301,6 +311,7 @@ private static class RegionMetricsImpl implements RegionMetrics {
this.storeCount = storeCount;
this.storeFileCount = storeFileCount;
this.storeRefCount = storeRefCount;
this.maxStoreFileRefCount = maxStoreFileRefCount;
this.compactingCellCount = compactingCellCount;
this.compactedCellCount = compactedCellCount;
this.storeFileSize = Preconditions.checkNotNull(storeFileSize);
Expand Down Expand Up @@ -340,6 +351,11 @@ public int getStoreRefCount() {
return storeRefCount;
}

@Override
public int getMaxStoreFileRefCount() {
return maxStoreFileRefCount;
}

@Override
public Size getStoreFileSize() {
return storeFileSize;
Expand Down Expand Up @@ -433,6 +449,8 @@ public String toString() {
this.getStoreFileCount());
Strings.appendKeyValue(sb, "storeRefCount",
this.getStoreRefCount());
Strings.appendKeyValue(sb, "maxStoreFileRefCount",
this.getMaxStoreFileRefCount());
Strings.appendKeyValue(sb, "uncompressedStoreFileSize",
this.getUncompressedStoreFileSize());
Strings.appendKeyValue(sb, "lastMajorCompactionTimestamp",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -343,6 +343,8 @@ public long getLastReportTimestamp() {
public String toString() {
int storeCount = 0;
int storeFileCount = 0;
int storeRefCount = 0;
int maxStoreFileRefCount = 0;
long uncompressedStoreFileSizeMB = 0;
long storeFileSizeMB = 0;
long memStoreSizeMB = 0;
Expand All @@ -358,6 +360,9 @@ public String toString() {
for (RegionMetrics r : getRegionMetrics().values()) {
storeCount += r.getStoreCount();
storeFileCount += r.getStoreFileCount();
storeRefCount += r.getStoreRefCount();
int currentMaxStoreFileRefCount = r.getMaxStoreFileRefCount();
maxStoreFileRefCount = Math.max(maxStoreFileRefCount, currentMaxStoreFileRefCount);
uncompressedStoreFileSizeMB += r.getUncompressedStoreFileSize().get(Size.Unit.MEGABYTE);
storeFileSizeMB += r.getStoreFileSize().get(Size.Unit.MEGABYTE);
memStoreSizeMB += r.getMemStoreSize().get(Size.Unit.MEGABYTE);
Expand All @@ -379,6 +384,8 @@ public String toString() {
Strings.appendKeyValue(sb, "maxHeapMB", getMaxHeapSize());
Strings.appendKeyValue(sb, "numberOfStores", storeCount);
Strings.appendKeyValue(sb, "numberOfStorefiles", storeFileCount);
Strings.appendKeyValue(sb, "storeRefCount", storeRefCount);
Strings.appendKeyValue(sb, "maxStoreFileRefCount", maxStoreFileRefCount);
Strings.appendKeyValue(sb, "storefileUncompressedSizeMB", uncompressedStoreFileSizeMB);
Strings.appendKeyValue(sb, "storefileSizeMB", storeFileSizeMB);
if (uncompressedStoreFileSizeMB != 0) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1470,6 +1470,13 @@ public enum OperationStatusCode {
// User defined Default TTL config key
public static final String DEFAULT_SNAPSHOT_TTL_CONFIG_KEY = "hbase.master.snapshot.ttl";

// Regions Recovery based on high storeFileRefCount threshold value
public static final String STORE_FILE_REF_COUNT_THRESHOLD =
"hbase.regions.recovery.store.file.ref.count";

// default -1 indicates there is no threshold on high storeRefCount
public static final int DEFAULT_STORE_FILE_REF_COUNT_THRESHOLD = -1;

/**
* Configurations for master executor services.
*/
Expand Down
29 changes: 29 additions & 0 deletions hbase-common/src/main/resources/hbase-default.xml
Original file line number Diff line number Diff line change
Expand Up @@ -1901,4 +1901,33 @@ possible configurations would overwhelm and obscure the important.
automatically deleted until it is manually deleted
</description>
</property>
<property>
<name>hbase.master.regions.recovery.check.interval</name>
<value>1200000</value>
<description>
Regions Recovery Chore interval in milliseconds.
This chore keeps running at this interval to
find all regions with configurable max store file ref count
and reopens them.
</description>
</property>
<property>
<name>hbase.regions.recovery.store.file.ref.count</name>
<value>-1</value>
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should allow users to change this with out restarting the HM. But on a followup issue

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

you mean similar to _switch command?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No.. Via ConfigurationObserver way and allow changes to be impacted with out restart of process

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure, looking forward to it on follow up Jira

<description>
Very large ref count on a file indicates
that it is a ref leak on that object. Such files
can not be removed even after it is invalidated
via compaction. Only way to recover in such
scenario is to reopen the region which can
release all resources, like the refcount, leases, etc.
This config represents Store files Ref Count threshold
value considered for reopening regions.
Any region with store files ref count > this value
would be eligible for reopening by master.
Default value -1 indicates this feature is turned off.
Only positive integer value should be provided to enable
this feature.
</description>
</property>
</configuration>
Original file line number Diff line number Diff line change
Expand Up @@ -233,6 +233,7 @@ public interface MetricsRegionServerSource extends BaseSource, JvmPauseMonitorSo
String STOREFILE_COUNT_DESC = "Number of Store Files";
String STORE_REF_COUNT = "storeRefCount";
String STORE_REF_COUNT_DESC = "Store reference count";
String MAX_STORE_FILE_REF_COUNT = "maxStoreFileRefCount";
String MEMSTORE_SIZE = "memStoreSize";
String MEMSTORE_SIZE_DESC = "Size of the memstore";
String STOREFILE_SIZE = "storeFileSize";
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -164,4 +164,10 @@ public interface MetricsRegionWrapper {
* @return the number of references active on the store
*/
long getStoreRefCount();

/**
* @return the max number of references active on any store file among
* all store files that belong to this region
*/
long getMaxStoreFileRefCount();
}
Original file line number Diff line number Diff line change
Expand Up @@ -217,6 +217,10 @@ void snapshot(MetricsRecordBuilder mrb, boolean ignored) {
regionNamePrefix + MetricsRegionServerSource.STORE_REF_COUNT,
MetricsRegionServerSource.STORE_REF_COUNT),
this.regionWrapper.getStoreRefCount());
mrb.addGauge(Interns.info(
regionNamePrefix + MetricsRegionServerSource.MAX_STORE_FILE_REF_COUNT,
MetricsRegionServerSource.MAX_STORE_FILE_REF_COUNT),
this.regionWrapper.getMaxStoreFileRefCount());
mrb.addGauge(Interns.info(
regionNamePrefix + MetricsRegionServerSource.MEMSTORE_SIZE,
MetricsRegionServerSource.MEMSTORE_SIZE_DESC),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,11 @@ public long getStoreRefCount() {
return 0;
}

@Override
public long getMaxStoreFileRefCount() {
return 0;
}

@Override
public long getMemStoreSize() {
return 0;
Expand Down
6 changes: 6 additions & 0 deletions hbase-protocol-shaded/src/main/protobuf/ClusterStatus.proto
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,12 @@ message RegionLoad {

/** the number of references active on the store */
optional int32 store_ref_count = 21 [default = 0];

/**
* The max number of references active on single store file among all store files
* that belong to given region
*/
optional int32 max_store_file_ref_count = 22 [default = 0];
}

/* Server-level protobufs */
Expand Down
6 changes: 6 additions & 0 deletions hbase-protocol/src/main/protobuf/ClusterStatus.proto
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,12 @@ message RegionLoad {

/** the number of references active on the store */
optional int32 store_ref_count = 21 [default = 0];

/**
* The max number of references active on single store file among all store files
* that belong to given region
*/
optional int32 max_store_file_ref_count = 22 [default = 0];
}

/* Server-level protobufs */
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,7 @@
import org.apache.hadoop.hbase.master.procedure.ProcedurePrepareLatch;
import org.apache.hadoop.hbase.master.procedure.ProcedureSyncWait;
import org.apache.hadoop.hbase.master.procedure.RecoverMetaProcedure;
import org.apache.hadoop.hbase.master.procedure.ReopenTableRegionsProcedure;
import org.apache.hadoop.hbase.master.procedure.ServerCrashProcedure;
import org.apache.hadoop.hbase.master.procedure.TruncateTableProcedure;
import org.apache.hadoop.hbase.master.replication.AbstractPeerProcedure;
Expand Down Expand Up @@ -420,6 +421,7 @@ public void run() {
// monitor for distributed procedures
private MasterProcedureManagerHost mpmHost;

private RegionsRecoveryChore regionsRecoveryChore = null;
// it is assigned after 'initialized' guard set to true, so should be volatile
private volatile MasterQuotaManager quotaManager;
private SpaceQuotaSnapshotNotifier spaceQuotaSnapshotNotifier;
Expand Down Expand Up @@ -1459,6 +1461,20 @@ private void startServiceThreads() throws IOException {
getMasterFileSystem().getFileSystem(), archiveDir, cleanerPool, params);
getChoreService().scheduleChore(hfileCleaner);

// Regions Reopen based on very high storeFileRefCount is considered enabled
// only if hbase.regions.recovery.store.file.ref.count has value > 0
final int maxStoreFileRefCount = conf.getInt(
HConstants.STORE_FILE_REF_COUNT_THRESHOLD,
HConstants.DEFAULT_STORE_FILE_REF_COUNT_THRESHOLD);
if (maxStoreFileRefCount > 0) {
this.regionsRecoveryChore = new RegionsRecoveryChore(this, conf, this);
getChoreService().scheduleChore(this.regionsRecoveryChore);
} else {
LOG.info("Reopening regions with very high storeFileRefCount is disabled. " +
"Provide threshold value > 0 for {} to enable it.",
HConstants.STORE_FILE_REF_COUNT_THRESHOLD);
}

replicationBarrierCleaner = new ReplicationBarrierCleaner(conf, this, getConnection(),
replicationPeerManager);
getChoreService().scheduleChore(replicationBarrierCleaner);
Expand Down Expand Up @@ -1626,6 +1642,7 @@ private void stopChores() {
choreService.cancelChore(this.replicationBarrierCleaner);
choreService.cancelChore(this.snapshotCleanerChore);
choreService.cancelChore(this.hbckChore);
choreService.cancelChore(this.regionsRecoveryChore);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If the chore was not scheduled, this cancel call wont make any issue right? Just confirming

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes, it won't make any issues, I confirmed it

}
}

Expand Down Expand Up @@ -3727,6 +3744,38 @@ public void remoteProcedureFailed(long procId, RemoteProcedureException error) {
}
}

/**
* Reopen regions provided in the argument
*
* @param tableName The current table name
* @param regionNames The region names of the regions to reopen
* @param nonceGroup Identifier for the source of the request, a client or process
* @param nonce A unique identifier for this operation from the client or process identified by
* <code>nonceGroup</code> (the source must ensure each operation gets a unique id).
* @return procedure Id
* @throws IOException if reopening region fails while running procedure
*/
long reopenRegions(final TableName tableName, final List<byte[]> regionNames,
final long nonceGroup, final long nonce)
throws IOException {

return MasterProcedureUtil
.submitProcedure(new MasterProcedureUtil.NonceProcedureRunnable(this, nonceGroup, nonce) {

@Override
protected void run() throws IOException {
submitProcedure(new ReopenTableRegionsProcedure(tableName, regionNames));
}

@Override
protected String getDescription() {
return "ReopenTableRegionsProcedure";
}

});

}

@Override
public ReplicationPeerManager getReplicationPeerManager() {
return replicationPeerManager;
Expand Down
Loading