Skip to content

Commit

Permalink
HBASE-22414 Interruption of moving regions in RSGroup will cause regi…
Browse files Browse the repository at this point in the history
…ons on wrong rs

Signed-off-by: Wellington Chevreuil <[email protected]>
  • Loading branch information
sunhelly authored and wchevreuil committed Jul 15, 2019
1 parent 380f80f commit 74731c2
Show file tree
Hide file tree
Showing 3 changed files with 315 additions and 42 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,10 @@
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.function.Function;

import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.hbase.DoNotRetryIOException;
import org.apache.hadoop.hbase.NamespaceDescriptor;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.TableName;
Expand All @@ -54,15 +57,27 @@
@InterfaceAudience.Private
public class RSGroupAdminServer implements RSGroupAdmin {
private static final Logger LOG = LoggerFactory.getLogger(RSGroupAdminServer.class);
public static final String KEEP_ONE_SERVER_IN_DEFAULT_ERROR_MESSAGE = "should keep at least " +
static final String KEEP_ONE_SERVER_IN_DEFAULT_ERROR_MESSAGE = "should keep at least " +
"one server in 'default' RSGroup.";

private MasterServices master;
private final RSGroupInfoManager rsGroupInfoManager;

/** Define the config key of retries threshold when movements failed */
//made package private for testing
static final String FAILED_MOVE_MAX_RETRY = "hbase.rsgroup.move.max.retry";

/** Define the default number of retries */
//made package private for testing
static final int DEFAULT_MAX_RETRY_VALUE = 50;

private int moveMaxRetry;

public RSGroupAdminServer(MasterServices master, RSGroupInfoManager rsGroupInfoManager) {
this.master = master;
this.rsGroupInfoManager = rsGroupInfoManager;
this.moveMaxRetry = master.getConfiguration().getInt(FAILED_MOVE_MAX_RETRY,
DEFAULT_MAX_RETRY_VALUE);
}

@Override
Expand Down Expand Up @@ -202,20 +217,77 @@ private void checkServersAndTables(Set<Address> servers, Set<TableName> tables,
* @throws IOException if moving the server and tables fail
*/
private void moveServerRegionsFromGroup(Set<Address> servers, String targetGroupName)
throws IOException {
throws IOException {
moveRegionsBetweenGroups(servers, targetGroupName,
rs -> getRegions(rs),
info -> {
try {
RSGroupInfo group = getRSGroupInfo(targetGroupName);
return group.containsTable(info.getTable());
} catch (IOException e) {
e.printStackTrace();
return false;
}
},
rs -> rs.getHostname());
}

/**
* Moves regions of tables which are not on target group servers.
*
* @param tables the tables that will move to new group
* @param targetGroupName the target group name
* @throws IOException if moving the region fails
*/
private void moveTableRegionsToGroup(Set<TableName> tables, String targetGroupName)
throws IOException {
moveRegionsBetweenGroups(tables, targetGroupName,
table -> {
if (master.getAssignmentManager().isTableDisabled(table)) {
return new ArrayList<>();
}
return master.getAssignmentManager().getRegionStates().getRegionsOfTable(table);
},
info -> {
try {
RSGroupInfo group = getRSGroupInfo(targetGroupName);
ServerName sn =
master.getAssignmentManager().getRegionStates().getRegionServerOfRegion(info);
return group.containsServer(sn.getAddress());
} catch (IOException e) {
e.printStackTrace();
return false;
}
},
table -> table.getNameWithNamespaceInclAsString());
}

private <T> void moveRegionsBetweenGroups(Set<T> regionsOwners, String targetGroupName,
Function<T, List<RegionInfo>> getRegionsInfo, Function<RegionInfo, Boolean> validation,
Function<T, String> getOwnerName) throws IOException {
boolean hasRegionsToMove;
RSGroupInfo targetGrp = getRSGroupInfo(targetGroupName);
Set<Address> allSevers = new HashSet<>(servers);
int retry = 0;
Set<T> allOwners = new HashSet<>(regionsOwners);
Set<String> failedRegions = new HashSet<>();
IOException toThrow = null;
do {
hasRegionsToMove = false;
for (Iterator<Address> iter = allSevers.iterator(); iter.hasNext();) {
Address rs = iter.next();
for (Iterator<T> iter = allOwners.iterator(); iter.hasNext(); ) {
T owner = iter.next();
// Get regions that are associated with this server and filter regions by group tables.
for (RegionInfo region : getRegions(rs)) {
if (!targetGrp.containsTable(region.getTable())) {
LOG.info("Moving server region {}, which do not belong to RSGroup {}",
for (RegionInfo region : getRegionsInfo.apply(owner)) {
if (!validation.apply(region)) {
LOG.info("Moving region {}, which do not belong to RSGroup {}",
region.getShortNameToLog(), targetGroupName);
this.master.getAssignmentManager().move(region);
try {
this.master.getAssignmentManager().move(region);
failedRegions.remove(region.getRegionNameAsString());
} catch (IOException ioe) {
LOG.debug("Move region {} from group failed, will retry, current retry time is {}",
region.getShortNameToLog(), retry, ioe);
toThrow = ioe;
failedRegions.add(region.getRegionNameAsString());
}
if (master.getAssignmentManager().getRegionStates().
getRegionState(region).isFailedOpen()) {
continue;
Expand All @@ -225,44 +297,30 @@ private void moveServerRegionsFromGroup(Set<Address> servers, String targetGroup
}

if (!hasRegionsToMove) {
LOG.info("Server {} has no more regions to move for RSGroup", rs.getHostname());
LOG.info("No more regions to move from {} to RSGroup", getOwnerName.apply(owner));
iter.remove();
}
}

retry++;
try {
rsGroupInfoManager.wait(1000);
} catch (InterruptedException e) {
LOG.warn("Sleep interrupted", e);
Thread.currentThread().interrupt();
}
} while (hasRegionsToMove);
}

/**
* Moves regions of tables which are not on target group servers.
*
* @param tables the tables that will move to new group
* @param targetGroupName the target group name
* @throws IOException if moving the region fails
*/
private void moveTableRegionsToGroup(Set<TableName> tables, String targetGroupName)
throws IOException {
RSGroupInfo targetGrp = getRSGroupInfo(targetGroupName);
for (TableName table : tables) {
if (master.getAssignmentManager().isTableDisabled(table)) {
LOG.debug("Skipping move regions because the table {} is disabled", table);
continue;
}
LOG.info("Moving region(s) for table {} to RSGroup {}", table, targetGroupName);
for (RegionInfo region : master.getAssignmentManager().getRegionStates()
.getRegionsOfTable(table)) {
ServerName sn =
master.getAssignmentManager().getRegionStates().getRegionServerOfRegion(region);
if (!targetGrp.containsServer(sn.getAddress())) {
LOG.info("Moving region {} to RSGroup {}", region.getShortNameToLog(), targetGroupName);
master.getAssignmentManager().move(region);
}
}
} while (hasRegionsToMove && retry <= moveMaxRetry);

//has up to max retry time or there are no more regions to move
if (hasRegionsToMove) {
// print failed moved regions, for later process conveniently
String msg = String
.format("move regions for group %s failed, failed regions: %s", targetGroupName,
failedRegions);
LOG.error(msg);
throw new DoNotRetryIOException(
msg + ", just record the last failed region's cause, more details in server log",
toThrow);
}
}

Expand Down
Loading

0 comments on commit 74731c2

Please sign in to comment.