Skip to content

Commit

Permalink
HBASE-28522 UNASSIGN proc indefinitely stuck on dead rs (#5995)
Browse files Browse the repository at this point in the history
Signed-off-by: Viraj Jasani <[email protected]>
Reviewed-by: Ray Mattingly <[email protected]>
  • Loading branch information
Apache9 authored Jul 27, 2024
1 parent 43b1d78 commit 634b200
Show file tree
Hide file tree
Showing 8 changed files with 375 additions and 184 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -811,3 +811,12 @@ message CloseExcessRegionReplicasProcedureStateData {
required TableName table_name = 1;
required uint32 new_replica_count = 2;
}

// States of the close-table-regions procedure state machine.
// NOTE(review): presumably SCHEDULE is the step that submits the unassign procedures and
// CONFIRM is the step that checks every region is closed -- confirm against the concrete
// procedure class that uses this enum.
enum CloseTableRegionsProcedureState {
CLOSE_TABLE_REGIONS_SCHEDULE = 1;
CLOSE_TABLE_REGIONS_CONFIRM = 2;
}

// State data message for the close-table-regions procedure; it carries only the table name.
message CloseTableRegionsProcedureStateData {
// The table the procedure operates on.
required TableName table_name = 1;
}
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
import java.util.concurrent.locks.Condition;
import java.util.concurrent.locks.ReentrantLock;
import java.util.function.Consumer;
import java.util.function.Function;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.hadoop.conf.Configuration;
Expand Down Expand Up @@ -1084,25 +1085,16 @@ public TransitRegionStateProcedure[] createUnassignProceduresForDisabling(TableN
.toArray(TransitRegionStateProcedure[]::new);
}

/**
* Called by ModifyTableProcedure to unassign all the excess region replicas for a table. Will
* skip submit unassign procedure if the region is in transition, so you may need to call this
* method multiple times.
* @param tableName the table for closing excess region replicas
* @param newReplicaCount the new replica count, should be less than current replica count
* @param submit for submitting procedure
* @return the number of regions in transition that we can not schedule unassign procedures
*/
public int submitUnassignProcedureForClosingExcessRegionReplicas(TableName tableName,
int newReplicaCount, Consumer<TransitRegionStateProcedure> submit) {
private int submitUnassignProcedure(TableName tableName,
Function<RegionStateNode, Boolean> shouldSubmit, Consumer<RegionStateNode> logRIT,
Consumer<TransitRegionStateProcedure> submit) {
int inTransitionCount = 0;
for (RegionStateNode regionNode : regionStates.getTableRegionStateNodes(tableName)) {
regionNode.lock();
try {
if (regionNode.getRegionInfo().getReplicaId() >= newReplicaCount) {
if (shouldSubmit.apply(regionNode)) {
if (regionNode.isInTransition()) {
LOG.debug("skip scheduling unassign procedure for {} when closing excess region "
+ "replicas since it is in transition", regionNode);
logRIT.accept(regionNode);
inTransitionCount++;
continue;
}
Expand All @@ -1119,12 +1111,46 @@ public int submitUnassignProcedureForClosingExcessRegionReplicas(TableName table
return inTransitionCount;
}

public int numberOfUnclosedExcessRegionReplicas(TableName tableName, int newReplicaCount) {
/**
 * Called by DisableTableProcedure to unassign all regions of a table. A region that is currently
 * in transition is skipped (and only logged), so callers may need to invoke this method several
 * times until nothing is left in transition.
 * @param tableName the table whose regions should be unassigned
 * @param submit    callback used for submitting each created unassign procedure
 * @return the number of regions in transition that we can not schedule unassign procedures for
 */
public int submitUnassignProcedureForDisablingTable(TableName tableName,
  Consumer<TransitRegionStateProcedure> submit) {
  // Disabling a table affects every one of its regions, so nothing is filtered out.
  Function<RegionStateNode, Boolean> everyRegion = regionNode -> true;
  Consumer<RegionStateNode> logInTransition = regionNode -> LOG.debug(
    "skip scheduling unassign procedure for {} when closing table regions "
      + "for disabling since it is in transition",
    regionNode);
  return submitUnassignProcedure(tableName, everyRegion, logInTransition, submit);
}

/**
 * Called by ModifyTableProcedure to unassign the excess region replicas of a table, i.e. the
 * regions whose replica id is greater than or equal to {@code newReplicaCount}. A region that is
 * currently in transition is skipped (and only logged), so callers may need to invoke this method
 * several times until nothing is left in transition.
 * @param tableName       the table for closing excess region replicas
 * @param newReplicaCount the new replica count, should be less than current replica count
 * @param submit          callback used for submitting each created unassign procedure
 * @return the number of regions in transition that we can not schedule unassign procedures for
 */
public int submitUnassignProcedureForClosingExcessRegionReplicas(TableName tableName,
  int newReplicaCount, Consumer<TransitRegionStateProcedure> submit) {
  // Only replicas at or beyond the new replica count are excess.
  Function<RegionStateNode, Boolean> isExcessReplica =
    regionNode -> regionNode.getRegionInfo().getReplicaId() >= newReplicaCount;
  Consumer<RegionStateNode> logInTransition = regionNode -> LOG.debug(
    "skip scheduling unassign procedure for {} when closing excess region "
      + "replicas since it is in transition",
    regionNode);
  return submitUnassignProcedure(tableName, isExcessReplica, logInTransition, submit);
}

private int numberOfUnclosedRegions(TableName tableName,
Function<RegionStateNode, Boolean> shouldSubmit) {
int unclosed = 0;
for (RegionStateNode regionNode : regionStates.getTableRegionStateNodes(tableName)) {
regionNode.lock();
try {
if (regionNode.getRegionInfo().getReplicaId() >= newReplicaCount) {
if (shouldSubmit.apply(regionNode)) {
if (!regionNode.isInState(State.OFFLINE, State.CLOSED, State.SPLIT)) {
unclosed++;
}
Expand All @@ -1136,6 +1162,15 @@ public int numberOfUnclosedExcessRegionReplicas(TableName tableName, int newRepl
return unclosed;
}

/**
 * Counts the regions of the given table that are not yet in OFFLINE, CLOSED or SPLIT state.
 * Every region of the table is taken into account.
 * @param tableName the table being disabled
 * @return the number of regions that are still unclosed
 */
public int numberOfUnclosedRegionsForDisabling(TableName tableName) {
  // No filtering: disabling concerns all regions of the table.
  Function<RegionStateNode, Boolean> everyRegion = regionNode -> true;
  return numberOfUnclosedRegions(tableName, everyRegion);
}

/**
 * Counts the excess region replicas of the given table, i.e. regions whose replica id is greater
 * than or equal to {@code newReplicaCount}, that are not yet in OFFLINE, CLOSED or SPLIT state.
 * @param tableName       the table whose replica count is being reduced
 * @param newReplicaCount the new replica count
 * @return the number of excess region replicas that are still unclosed
 */
public int numberOfUnclosedExcessRegionReplicas(TableName tableName, int newReplicaCount) {
  Function<RegionStateNode, Boolean> isExcessReplica =
    regionNode -> regionNode.getRegionInfo().getReplicaId() >= newReplicaCount;
  return numberOfUnclosedRegions(tableName, isExcessReplica);
}

public SplitTableRegionProcedure createSplitProcedure(final RegionInfo regionToSplit,
final byte[] splitKey) throws IOException {
return new SplitTableRegionProcedure(getProcedureEnvironment(), regionToSplit, splitKey);
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase.master.procedure;

import java.io.IOException;
import java.util.function.Consumer;
import org.apache.commons.lang3.mutable.MutableBoolean;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.master.assignment.TransitRegionStateProcedure;
import org.apache.hadoop.hbase.procedure2.ProcedureSuspendedException;
import org.apache.hadoop.hbase.procedure2.ProcedureUtil;
import org.apache.hadoop.hbase.procedure2.ProcedureYieldException;
import org.apache.hadoop.hbase.util.RetryCounter;
import org.apache.yetus.audience.InterfaceAudience;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.hadoop.hbase.shaded.protobuf.generated.ProcedureProtos;

/**
 * Common skeleton for procedures that unassign (close) regions of a table.
 * <p>
 * The procedure loops over two states: the initial state, in which unassign procedures are
 * submitted as child procedures, and the confirm state, in which it checks whether there are
 * still unclosed regions. While unclosed regions remain it goes back to the initial state.
 * Subclasses decide which regions are affected by implementing
 * {@link #submitUnassignProcedure(MasterProcedureEnv, Consumer)} and
 * {@link #numberOfUnclosedRegions(MasterProcedureEnv)}.
 */
@InterfaceAudience.Private
public abstract class AbstractCloseTableRegionsProcedure<TState extends Enum<?>>
  extends AbstractStateMachineTableProcedure<TState> {

  private static final Logger LOG =
    LoggerFactory.getLogger(AbstractCloseTableRegionsProcedure.class);

  /** The table whose regions are being closed. */
  protected TableName tableName;

  /** Created lazily the first time we have to back off because of regions in transition. */
  private RetryCounter retryCounter;

  /**
   * No-arg constructor; leaves {@link #tableName} unset.
   */
  protected AbstractCloseTableRegionsProcedure() {
  }

  /**
   * @param tableName the table whose regions should be closed
   */
  protected AbstractCloseTableRegionsProcedure(TableName tableName) {
    this.tableName = tableName;
  }

  @Override
  public TableName getTableName() {
    return tableName;
  }

  @Override
  public TableOperationType getTableOperationType() {
    return TableOperationType.REGION_EDIT;
  }

  /**
   * Initial state: ask the subclass to submit unassign procedures, registering each of them as a
   * child procedure. If nothing could be submitted because regions are still in transition, back
   * off and suspend; otherwise move on to the confirm state.
   */
  private Flow schedule(MasterProcedureEnv env) throws ProcedureSuspendedException {
    MutableBoolean anySubmitted = new MutableBoolean(false);
    Consumer<TransitRegionStateProcedure> addAsChild = proc -> {
      anySubmitted.setTrue();
      addChildProcedure(proc);
    };
    int ritCount = submitUnassignProcedure(env, addAsChild);
    // No unassign procedure was scheduled and there are still regions in transition, so there is
    // nothing to wait on: sleep for a while and try again.
    boolean stalled = ritCount > 0 && anySubmitted.isFalse();
    if (stalled) {
      suspendWithBackoff(env, ritCount);
    }
    setNextState(getConfirmState());
    return Flow.HAS_MORE_STATE;
  }

  /**
   * Logs the remaining regions in transition and suspends this procedure for the next backoff
   * interval taken from the (lazily created) retry counter.
   */
  private void suspendWithBackoff(MasterProcedureEnv env, int ritCount)
    throws ProcedureSuspendedException {
    if (retryCounter == null) {
      retryCounter = ProcedureUtil.createRetryCounter(env.getMasterConfiguration());
    }
    long backoffMillis = retryCounter.getBackoffTimeAndIncrementAttempts();
    LOG.info(
      "There are still {} region(s) in transition for closing regions of table {}"
        + " when executing {}, suspend {}secs and try again later",
      ritCount, tableName, getClass().getSimpleName(), backoffMillis / 1000);
    suspend((int) backoffMillis, true);
  }

  /**
   * Confirm state: finish when no unclosed region is left, otherwise go back to the initial state
   * to schedule again.
   */
  private Flow confirm(MasterProcedureEnv env) {
    int remaining = numberOfUnclosedRegions(env);
    if (remaining <= 0) {
      return Flow.NO_MORE_STATE;
    }
    LOG.info(
      "There are still {} unclosed region(s) for closing regions of table {}"
        + " when executing {}, continue...",
      remaining, tableName, getClass().getSimpleName());
    setNextState(getInitialState());
    return Flow.HAS_MORE_STATE;
  }

  @Override
  protected Flow executeFromState(MasterProcedureEnv env, TState state)
    throws ProcedureSuspendedException, ProcedureYieldException, InterruptedException {
    LOG.trace("{} execute state={}", this, state);
    if (state == getInitialState()) {
      return schedule(env);
    }
    if (state == getConfirmState()) {
      return confirm(env);
    }
    throw new UnsupportedOperationException("unhandled state=" + state);
  }

  /**
   * Rollback is not supported by this type of procedure.
   */
  @Override
  protected void rollbackState(MasterProcedureEnv env, TState state)
    throws IOException, InterruptedException {
    throw new UnsupportedOperationException();
  }

  /**
   * Makes the procedure runnable again and puts it at the front of the scheduler queue instead of
   * failing it.
   * NOTE(review): presumably invoked when the timeout set by suspend expires -- confirm against
   * the procedure framework.
   * @return always {@code false}
   */
  @Override
  protected synchronized boolean setTimeoutFailure(MasterProcedureEnv env) {
    setState(ProcedureProtos.ProcedureState.RUNNABLE);
    env.getProcedureScheduler().addFront(this);
    return false;
  }

  /**
   * This type of procedure has two states: the initial state for scheduling unassign procedures,
   * and the confirm state for checking whether all the regions have been unassigned.
   * @return the confirm state
   */
  protected abstract TState getConfirmState();

  /**
   * Submit TRSPs for unassigning regions.
   * @param env    the master procedure environment
   * @param submit callback to invoke for every TRSP that should be scheduled
   * @return the number of regions in RIT state for which no TRSP could be scheduled
   */
  protected abstract int submitUnassignProcedure(MasterProcedureEnv env,
    Consumer<TransitRegionStateProcedure> submit);

  /**
   * Return the number of unclosed regions. Returning {@code 0} means we are done.
   */
  protected abstract int numberOfUnclosedRegions(MasterProcedureEnv env);
}
Loading

0 comments on commit 634b200

Please sign in to comment.