HBASE-27216 Revisit the ReplicationSyncUp tool
Apache9 committed Feb 2, 2023
1 parent 069d1ca commit 55a6256
Showing 11 changed files with 579 additions and 52 deletions.
MasterProcedure.proto
@@ -717,6 +717,7 @@ message ModifyColumnFamilyStoreFileTrackerStateData {
 enum AssignReplicationQueuesState {
   ASSIGN_REPLICATION_QUEUES_ADD_MISSING_QUEUES = 1;
   ASSIGN_REPLICATION_QUEUES_CLAIM = 2;
+  ASSIGN_REPLICATION_QUEUES_REMOVE_QUEUES = 3;
 }
 
 message AssignReplicationQueuesStateData {
ReplicationStorageFactory.java
@@ -18,6 +18,7 @@
 package org.apache.hadoop.hbase.replication;
 
 import java.io.IOException;
+import java.lang.reflect.Constructor;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.hbase.Coprocessor;
 import org.apache.hadoop.hbase.NamespaceDescriptor;
@@ -27,20 +28,27 @@
 import org.apache.hadoop.hbase.client.CoprocessorDescriptorBuilder;
 import org.apache.hadoop.hbase.client.TableDescriptor;
 import org.apache.hadoop.hbase.client.TableDescriptorBuilder;
+import org.apache.hadoop.hbase.util.ReflectionUtils;
 import org.apache.hadoop.hbase.zookeeper.ZKWatcher;
 import org.apache.yetus.audience.InterfaceAudience;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 /**
  * Used to create replication storage(peer, queue) classes.
  */
 @InterfaceAudience.Private
 public final class ReplicationStorageFactory {
 
+  private static final Logger LOG = LoggerFactory.getLogger(ReplicationStorageFactory.class);
+
   public static final String REPLICATION_QUEUE_TABLE_NAME = "hbase.replication.queue.table.name";
 
   public static final TableName REPLICATION_QUEUE_TABLE_NAME_DEFAULT =
     TableName.valueOf(NamespaceDescriptor.SYSTEM_NAMESPACE_NAME_STR, "replication");
 
+  public static final String REPLICATION_QUEUE_IMPL = "hbase.replication.queue.impl";
+
   public static TableDescriptor createReplicationQueueTableDescriptor(TableName tableName)
     throws IOException {
     return TableDescriptorBuilder.newBuilder(tableName)
@@ -72,15 +80,26 @@ public static ReplicationPeerStorage getReplicationPeerStorage(ZKWatcher zk, Con
    */
   public static ReplicationQueueStorage getReplicationQueueStorage(Connection conn,
     Configuration conf) {
-    return getReplicationQueueStorage(conn, TableName.valueOf(conf.get(REPLICATION_QUEUE_TABLE_NAME,
-      REPLICATION_QUEUE_TABLE_NAME_DEFAULT.getNameAsString())));
+    return getReplicationQueueStorage(conn, conf, TableName.valueOf(conf
+      .get(REPLICATION_QUEUE_TABLE_NAME, REPLICATION_QUEUE_TABLE_NAME_DEFAULT.getNameAsString())));
   }
 
   /**
    * Create a new {@link ReplicationQueueStorage}.
    */
   public static ReplicationQueueStorage getReplicationQueueStorage(Connection conn,
-    TableName tableName) {
-    return new TableReplicationQueueStorage(conn, tableName);
+    Configuration conf, TableName tableName) {
+    Class<? extends ReplicationQueueStorage> clazz = conf.getClass(REPLICATION_QUEUE_IMPL,
+      TableReplicationQueueStorage.class, ReplicationQueueStorage.class);
+    try {
+      Constructor<? extends ReplicationQueueStorage> c =
+        clazz.getConstructor(Connection.class, TableName.class);
+      return c.newInstance(conn, tableName);
+    } catch (Exception e) {
+      LOG.debug(
+        "failed to create ReplicationQueueStorage with Connection, try creating with Configuration",
+        e);
+      return ReflectionUtils.newInstance(clazz, conf, tableName);
+    }
   }
 }
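
The new REPLICATION_QUEUE_IMPL key makes the queue storage implementation pluggable. Below is a minimal usage sketch, not taken from this commit: MyReplicationQueueStorage is a hypothetical implementation of ReplicationQueueStorage with a (Connection, TableName) constructor; everything else is standard HBase client API.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.replication.ReplicationQueueStorage;
import org.apache.hadoop.hbase.replication.ReplicationStorageFactory;

public class QueueStorageExample {
  public static void main(String[] args) throws Exception {
    Configuration conf = HBaseConfiguration.create();
    // Point the factory at a custom implementation (hypothetical class).
    conf.setClass(ReplicationStorageFactory.REPLICATION_QUEUE_IMPL,
      MyReplicationQueueStorage.class, ReplicationQueueStorage.class);
    try (Connection conn = ConnectionFactory.createConnection(conf)) {
      // The factory first tries a (Connection, TableName) constructor, then
      // falls back to a (Configuration, TableName) one via ReflectionUtils.
      ReplicationQueueStorage storage =
        ReplicationStorageFactory.getReplicationQueueStorage(conn, conf);
      // ... read or update replication queues through the storage API ...
    }
  }
}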
AssignReplicationQueuesProcedure.java
@@ -24,7 +24,9 @@
 import java.util.List;
 import java.util.Set;
 import java.util.stream.Collectors;
+import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.hbase.ServerName;
+import org.apache.hadoop.hbase.master.MasterFileSystem;
 import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv;
 import org.apache.hadoop.hbase.master.procedure.ServerProcedureInterface;
 import org.apache.hadoop.hbase.procedure2.ProcedureStateSerializer;
@@ -37,6 +39,7 @@
 import org.apache.hadoop.hbase.replication.ReplicationPeerDescription;
 import org.apache.hadoop.hbase.replication.ReplicationQueueId;
 import org.apache.hadoop.hbase.replication.ReplicationQueueStorage;
+import org.apache.hadoop.hbase.replication.regionserver.ReplicationSyncUp;
 import org.apache.hadoop.hbase.util.RetryCounter;
 import org.apache.yetus.audience.InterfaceAudience;
 import org.slf4j.Logger;
@@ -102,7 +105,7 @@ private void addMissingQueues(MasterProcedureEnv env) throws ReplicationExceptio
     }
   }
 
-  private Flow claimQueues(MasterProcedureEnv env) throws ReplicationException {
+  private Flow claimQueues(MasterProcedureEnv env) throws ReplicationException, IOException {
     Set<String> existingPeerIds = env.getReplicationPeerManager().listPeers(null).stream()
       .map(ReplicationPeerDescription::getPeerId).collect(Collectors.toSet());
     ReplicationQueueStorage storage = env.getReplicationPeerManager().getQueueStorage();
@@ -130,18 +133,51 @@ private Flow claimQueues(MasterProcedureEnv env) throws ReplicationException {
     return Flow.HAS_MORE_STATE;
   }
 
+  // check whether ReplicationSyncUp has already done the work for us, if so, we should skip
+  // claiming the replication queues and deleting them instead.
+  private boolean shouldSkip(MasterProcedureEnv env) throws IOException {
+    MasterFileSystem mfs = env.getMasterFileSystem();
+    Path syncUpDir = new Path(mfs.getRootDir(), ReplicationSyncUp.INFO_DIR);
+    return mfs.getFileSystem().exists(new Path(syncUpDir, crashedServer.getServerName()));
+  }
+
+  private void removeQueues(MasterProcedureEnv env) throws ReplicationException, IOException {
+    ReplicationQueueStorage storage = env.getReplicationPeerManager().getQueueStorage();
+    for (ReplicationQueueId queueId : storage.listAllQueueIds(crashedServer)) {
+      storage.removeQueue(queueId);
+    }
+    MasterFileSystem mfs = env.getMasterFileSystem();
+    Path syncUpDir = new Path(mfs.getRootDir(), ReplicationSyncUp.INFO_DIR);
+    // remove the region server record file
+    mfs.getFileSystem().delete(new Path(syncUpDir, crashedServer.getServerName()), false);
+  }
+
   @Override
   protected Flow executeFromState(MasterProcedureEnv env, AssignReplicationQueuesState state)
     throws ProcedureSuspendedException, ProcedureYieldException, InterruptedException {
     try {
       switch (state) {
         case ASSIGN_REPLICATION_QUEUES_ADD_MISSING_QUEUES:
-          addMissingQueues(env);
-          retryCounter = null;
-          setNextState(AssignReplicationQueuesState.ASSIGN_REPLICATION_QUEUES_CLAIM);
-          return Flow.HAS_MORE_STATE;
+          if (shouldSkip(env)) {
+            setNextState(AssignReplicationQueuesState.ASSIGN_REPLICATION_QUEUES_REMOVE_QUEUES);
+            return Flow.HAS_MORE_STATE;
+          } else {
+            addMissingQueues(env);
+            retryCounter = null;
+            setNextState(AssignReplicationQueuesState.ASSIGN_REPLICATION_QUEUES_CLAIM);
+            return Flow.HAS_MORE_STATE;
+          }
         case ASSIGN_REPLICATION_QUEUES_CLAIM:
-          return claimQueues(env);
+          if (shouldSkip(env)) {
+            retryCounter = null;
+            setNextState(AssignReplicationQueuesState.ASSIGN_REPLICATION_QUEUES_REMOVE_QUEUES);
+            return Flow.HAS_MORE_STATE;
+          } else {
+            return claimQueues(env);
+          }
+        case ASSIGN_REPLICATION_QUEUES_REMOVE_QUEUES:
+          removeQueues(env);
+          return Flow.NO_MORE_STATE;
         default:
           throw new UnsupportedOperationException("unhandled state=" + state);
       }
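
The skip logic above keys off marker files that the ReplicationSyncUp tool leaves under the HBase root directory. The tool side is not part of this excerpt, so the following is only a sketch of the convention the master-side code implies; the class, method, and variable names are assumptions.

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.replication.regionserver.ReplicationSyncUp;

// Assumed tool-side convention: after draining a dead region server's
// replication queues, record one empty file per server under
// <hbase.rootdir>/<ReplicationSyncUp.INFO_DIR>/<server name>. The
// shouldSkip() above tests for exactly this file, and removeQueues()
// deletes it once the master has cleaned up the queues.
class SyncUpMarkerSketch { // hypothetical class
  static void recordSyncedUpServers(Configuration conf, Path rootDir,
    Iterable<ServerName> replicatedDeadServers) throws IOException {
    FileSystem fs = rootDir.getFileSystem(conf);
    Path syncUpDir = new Path(rootDir, ReplicationSyncUp.INFO_DIR);
    fs.mkdirs(syncUpDir);
    for (ServerName dead : replicatedDeadServers) {
      fs.createNewFile(new Path(syncUpDir, dead.getServerName()));
    }
  }
}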
ClaimReplicationQueueRemoteProcedure.java
@@ -19,16 +19,22 @@
 
 import java.io.IOException;
 import java.util.Optional;
+import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.hbase.ServerName;
+import org.apache.hadoop.hbase.master.MasterFileSystem;
 import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv;
 import org.apache.hadoop.hbase.master.procedure.RSProcedureDispatcher.ServerOperation;
 import org.apache.hadoop.hbase.master.procedure.ServerProcedureInterface;
 import org.apache.hadoop.hbase.master.procedure.ServerRemoteProcedure;
+import org.apache.hadoop.hbase.procedure2.Procedure;
 import org.apache.hadoop.hbase.procedure2.ProcedureStateSerializer;
+import org.apache.hadoop.hbase.procedure2.ProcedureSuspendedException;
+import org.apache.hadoop.hbase.procedure2.ProcedureYieldException;
 import org.apache.hadoop.hbase.procedure2.RemoteProcedureDispatcher.RemoteOperation;
 import org.apache.hadoop.hbase.procedure2.RemoteProcedureDispatcher.RemoteProcedure;
 import org.apache.hadoop.hbase.replication.ReplicationQueueId;
 import org.apache.hadoop.hbase.replication.regionserver.ClaimReplicationQueueCallable;
+import org.apache.hadoop.hbase.replication.regionserver.ReplicationSyncUp;
 import org.apache.yetus.audience.InterfaceAudience;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -54,6 +60,32 @@ public ClaimReplicationQueueRemoteProcedure(ReplicationQueueId queueId, ServerNa
     this.targetServer = targetServer;
   }
 
+  // check whether ReplicationSyncUp has already done the work for us, if so, we should skip
+  // claiming the replication queues and deleting them instead.
+  private boolean shouldSkip(MasterProcedureEnv env) throws IOException {
+    MasterFileSystem mfs = env.getMasterFileSystem();
+    Path syncUpDir = new Path(mfs.getRootDir(), ReplicationSyncUp.INFO_DIR);
+    return mfs.getFileSystem().exists(new Path(syncUpDir, getServerName().getServerName()));
+  }
+
+  @Override
+  protected synchronized Procedure<MasterProcedureEnv>[] execute(MasterProcedureEnv env)
+    throws ProcedureYieldException, ProcedureSuspendedException, InterruptedException {
+    try {
+      if (shouldSkip(env)) {
+        LOG.info("Skip claiming {} because replication sync up has already done it for us",
+          getServerName());
+        return null;
+      }
+    } catch (IOException e) {
+      LOG.warn("failed to check whether we should skip claiming {} due to replication sync up",
+        getServerName(), e);
+      // just finish the procedure here, as the AssignReplicationQueuesProcedure will reschedule
+      return null;
+    }
+    return super.execute(env);
+  }
+
   @Override
   public Optional<RemoteOperation> remoteCallBuild(MasterProcedureEnv env, ServerName remote) {
     assert targetServer.equals(remote);
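
Both master-side procedures perform the identical marker-file check. A possible cleanup, not part of this commit (the helper class name is invented), would factor it into a small utility that each procedure delegates to:

import java.io.IOException;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.master.MasterFileSystem;
import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv;
import org.apache.hadoop.hbase.replication.regionserver.ReplicationSyncUp;

// Hypothetical helper, not in the commit: centralizes the duplicated check.
final class ReplicationSyncUpMarker {

  private ReplicationSyncUpMarker() {
  }

  // Returns true if ReplicationSyncUp left a marker file for the given server,
  // i.e. the tool already replicated that server's remaining WAL entries.
  static boolean syncUpDone(MasterProcedureEnv env, ServerName server) throws IOException {
    MasterFileSystem mfs = env.getMasterFileSystem();
    Path syncUpDir = new Path(mfs.getRootDir(), ReplicationSyncUp.INFO_DIR);
    return mfs.getFileSystem().exists(new Path(syncUpDir, server.getServerName()));
  }
}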