Skip to content

Commit

Permalink
HBASE-28150 CreateTableProcedure and DeleteTableProcedure should slee…
Browse files Browse the repository at this point in the history
…p a while before retrying (#5842)

Conflict: Copied the suspend method from HBASE-27218. It already handles suspension well, so there is no need to write the same code again for branch-2; if HBASE-27218 is backported later, this method can be skipped.

Signed-off-by: Duo Zhang <[email protected]>
(cherry picked from commit a36da5e)
  • Loading branch information
chaijunjie0101 authored and Apache9 committed Apr 22, 2024
1 parent 1159ba5 commit 1c05099
Show file tree
Hide file tree
Showing 6 changed files with 205 additions and 5 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ThreadLocalRandom;
import org.apache.hadoop.hbase.exceptions.TimeoutIOException;
import org.apache.hadoop.hbase.metrics.Counter;
import org.apache.hadoop.hbase.metrics.Histogram;
Expand All @@ -33,6 +34,7 @@
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.hadoop.hbase.shaded.protobuf.generated.ProcedureProtos;
import org.apache.hadoop.hbase.shaded.protobuf.generated.ProcedureProtos.ProcedureState;

/**
Expand Down Expand Up @@ -1034,6 +1036,19 @@ final void doReleaseLock(TEnvironment env, ProcedureStore store) {
releaseLock(env);
}

/**
 * Arms a timeout on this procedure, moves it to {@code WAITING_TIMEOUT} and suspends it.
 * <p>
 * This method never returns normally: it always throws a {@link ProcedureSuspendedException}.
 * The exception is also the declared return type so that callers can write
 * {@code throw suspend(...)} and let the compiler see that control flow ends there.
 * @param timeoutMillis how long to wait, in milliseconds, before the timeout fires
 * @param jitter        when {@code true}, the wait is lengthened by a random amount of less than
 *                      10% of {@code timeoutMillis}
 * @return never returns; see above
 * @throws ProcedureSuspendedException always
 */
protected final ProcedureSuspendedException suspend(int timeoutMillis, boolean jitter)
  throws ProcedureSuspendedException {
  int effectiveTimeoutMillis = timeoutMillis;
  if (jitter) {
    // Random fraction in [0, 0.1): at most (just under) 10% extra wait.
    double fraction = ThreadLocalRandom.current().nextDouble(0.1);
    double extra = (double) timeoutMillis * fraction;
    // Narrow back to int exactly like a compound "+=" on an int would.
    effectiveTimeoutMillis = (int) (timeoutMillis + extra);
  }
  setTimeout(effectiveTimeoutMillis);
  setState(ProcedureProtos.ProcedureState.WAITING_TIMEOUT);
  // NOTE(review): presumably avoids writing this transient state change to the procedure
  // store - confirm against skipPersistence().
  skipPersistence();
  throw new ProcedureSuspendedException();
}

@Override
public int compareTo(final Procedure<TEnvironment> other) {
return Long.compare(getProcId(), other.getProcId());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,12 +34,15 @@
import org.apache.hadoop.hbase.master.MasterCoprocessorHost;
import org.apache.hadoop.hbase.master.MasterFileSystem;
import org.apache.hadoop.hbase.procedure2.ProcedureStateSerializer;
import org.apache.hadoop.hbase.procedure2.ProcedureSuspendedException;
import org.apache.hadoop.hbase.procedure2.ProcedureUtil;
import org.apache.hadoop.hbase.regionserver.storefiletracker.StoreFileTrackerFactory;
import org.apache.hadoop.hbase.regionserver.storefiletracker.StoreFileTrackerValidationUtils;
import org.apache.hadoop.hbase.replication.ReplicationException;
import org.apache.hadoop.hbase.util.CommonFSUtils;
import org.apache.hadoop.hbase.util.FSTableDescriptors;
import org.apache.hadoop.hbase.util.ModifyRegionUtils;
import org.apache.hadoop.hbase.util.RetryCounter;
import org.apache.hadoop.hbase.util.ServerRegionReplicaUtil;
import org.apache.yetus.audience.InterfaceAudience;
import org.slf4j.Logger;
Expand All @@ -51,6 +54,7 @@
import org.apache.hadoop.hbase.shaded.protobuf.generated.HBaseProtos;
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos;
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.CreateTableState;
import org.apache.hadoop.hbase.shaded.protobuf.generated.ProcedureProtos;

@InterfaceAudience.Private
public class CreateTableProcedure extends AbstractStateMachineTableProcedure<CreateTableState> {
Expand All @@ -60,6 +64,7 @@ public class CreateTableProcedure extends AbstractStateMachineTableProcedure<Cre

private TableDescriptor tableDescriptor;
private List<RegionInfo> newRegions;
private RetryCounter retryCounter;

public CreateTableProcedure() {
// Required by the Procedure framework to create the procedure on replay
Expand All @@ -80,7 +85,7 @@ public CreateTableProcedure(final MasterProcedureEnv env, final TableDescriptor

@Override
protected Flow executeFromState(final MasterProcedureEnv env, final CreateTableState state)
throws InterruptedException {
throws InterruptedException, ProcedureSuspendedException {
LOG.info("{} execute state={}", this, state);
try {
switch (state) {
Expand Down Expand Up @@ -131,6 +136,7 @@ protected Flow executeFromState(final MasterProcedureEnv env, final CreateTableS
break;
case CREATE_TABLE_POST_OPERATION:
postCreate(env);
retryCounter = null;
return Flow.NO_MORE_STATE;
default:
throw new UnsupportedOperationException("unhandled state=" + state);
Expand All @@ -139,12 +145,26 @@ protected Flow executeFromState(final MasterProcedureEnv env, final CreateTableS
if (isRollbackSupported(state)) {
setFailure("master-create-table", e);
} else {
LOG.warn("Retriable error trying to create table=" + getTableName() + " state=" + state, e);
if (retryCounter == null) {
retryCounter = ProcedureUtil.createRetryCounter(env.getMasterConfiguration());
}
long backoff = retryCounter.getBackoffTimeAndIncrementAttempts();
LOG.warn("Retriable error trying to create table={},state={},suspend {}secs.",
getTableName(), state, backoff / 1000, e);
throw suspend(Math.toIntExact(backoff), true);
}
}
retryCounter = null;
return Flow.HAS_MORE_STATE;
}

/**
 * Puts this procedure back to work: it is switched to {@code RUNNABLE} and queued at the front
 * of the procedure scheduler.
 * <p>
 * NOTE(review): this looks like the wake-up half of the retry backoff started via
 * {@code suspend(...)}; returning {@code false} presumably tells the framework that the
 * procedure dealt with the timeout itself instead of failing - confirm against the base class.
 * @param env the master procedure environment, used to reach the procedure scheduler
 * @return always {@code false}
 */
@Override
protected synchronized boolean setTimeoutFailure(MasterProcedureEnv env) {
  final ProcedureProtos.ProcedureState runnable = ProcedureProtos.ProcedureState.RUNNABLE;
  // The state is changed before the procedure is handed back to the scheduler.
  setState(runnable);
  env.getProcedureScheduler().addFront(this);
  return false;
}

@Override
protected void rollbackState(final MasterProcedureEnv env, final CreateTableState state)
throws IOException {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,9 +41,12 @@
import org.apache.hadoop.hbase.mob.MobConstants;
import org.apache.hadoop.hbase.mob.MobUtils;
import org.apache.hadoop.hbase.procedure2.ProcedureStateSerializer;
import org.apache.hadoop.hbase.procedure2.ProcedureSuspendedException;
import org.apache.hadoop.hbase.procedure2.ProcedureUtil;
import org.apache.hadoop.hbase.util.CommonFSUtils;
import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
import org.apache.hadoop.hbase.util.FSUtils;
import org.apache.hadoop.hbase.util.RetryCounter;
import org.apache.yetus.audience.InterfaceAudience;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
Expand All @@ -54,13 +57,15 @@
import org.apache.hadoop.hbase.shaded.protobuf.generated.HBaseProtos;
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos;
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.DeleteTableState;
import org.apache.hadoop.hbase.shaded.protobuf.generated.ProcedureProtos;

@InterfaceAudience.Private
public class DeleteTableProcedure extends AbstractStateMachineTableProcedure<DeleteTableState> {
private static final Logger LOG = LoggerFactory.getLogger(DeleteTableProcedure.class);

private List<RegionInfo> regions;
private TableName tableName;
private RetryCounter retryCounter;

public DeleteTableProcedure() {
// Required by the Procedure framework to create the procedure on replay
Expand All @@ -79,7 +84,7 @@ public DeleteTableProcedure(final MasterProcedureEnv env, final TableName tableN

@Override
protected Flow executeFromState(final MasterProcedureEnv env, DeleteTableState state)
throws InterruptedException {
throws InterruptedException, ProcedureSuspendedException {
if (LOG.isTraceEnabled()) {
LOG.trace(this + " execute state=" + state);
}
Expand Down Expand Up @@ -124,6 +129,7 @@ protected Flow executeFromState(final MasterProcedureEnv env, DeleteTableState s
break;
case DELETE_TABLE_POST_OPERATION:
postDelete(env);
retryCounter = null;
LOG.debug("Finished {}", this);
return Flow.NO_MORE_STATE;
default:
Expand All @@ -133,12 +139,26 @@ protected Flow executeFromState(final MasterProcedureEnv env, DeleteTableState s
if (isRollbackSupported(state)) {
setFailure("master-delete-table", e);
} else {
LOG.warn("Retriable error trying to delete table=" + getTableName() + " state=" + state, e);
if (retryCounter == null) {
retryCounter = ProcedureUtil.createRetryCounter(env.getMasterConfiguration());
}
long backoff = retryCounter.getBackoffTimeAndIncrementAttempts();
LOG.warn("Retriable error trying to delete table={},state={},suspend {}secs.",
getTableName(), state, backoff / 1000, e);
throw suspend(Math.toIntExact(backoff), true);
}
}
retryCounter = null;
return Flow.HAS_MORE_STATE;
}

/**
 * Puts this procedure back to work: it is switched to {@code RUNNABLE} and queued at the front
 * of the procedure scheduler.
 * <p>
 * NOTE(review): this looks like the wake-up half of the retry backoff started via
 * {@code suspend(...)}; returning {@code false} presumably tells the framework that the
 * procedure dealt with the timeout itself instead of failing - confirm against the base class.
 * @param env the master procedure environment, used to reach the procedure scheduler
 * @return always {@code false}
 */
@Override
protected synchronized boolean setTimeoutFailure(MasterProcedureEnv env) {
  final ProcedureProtos.ProcedureState runnable = ProcedureProtos.ProcedureState.RUNNABLE;
  // The state is changed before the procedure is handed back to the scheduler.
  setState(runnable);
  env.getProcedureScheduler().addFront(this);
  return false;
}

@Override
protected boolean abort(MasterProcedureEnv env) {
// TODO: Current behavior is: with no rollback and no abort support, procedure may get stuck
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase.coprocessor;

import java.io.IOException;
import java.util.Optional;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.RegionInfo;
import org.apache.hadoop.hbase.client.TableDescriptor;

/**
 * A deliberately misbehaving master observer for tests. The first time a non-system table
 * finishes being created, and the first time a non-system table finishes being deleted, the
 * corresponding hook throws an {@link IOException}. Every later invocation of that hook, and
 * every invocation for a system table, is a no-op.
 */
public class BadMasterObserverForCreateDeleteTable implements MasterObserver, MasterCoprocessor {
  // Each flag flips to true once its hook has thrown its single injected failure
  // (boolean fields start out false).
  private boolean createFailedOnce;
  private boolean deleteFailedOnce;

  /**
   * Throws once for the first non-system table whose creation completes.
   */
  @Override
  public void postCompletedCreateTableAction(ObserverContext<MasterCoprocessorEnvironment> ctx,
    TableDescriptor desc, RegionInfo[] regions) throws IOException {
    if (createFailedOnce || desc.getTableName().isSystemTable()) {
      return;
    }
    // Record the failure before throwing so the next attempt goes through.
    createFailedOnce = true;
    throw new IOException("execute postCompletedCreateTableAction failed once.");
  }

  /**
   * Throws once for the first non-system table whose deletion completes.
   */
  @Override
  public void postCompletedDeleteTableAction(ObserverContext<MasterCoprocessorEnvironment> ctx,
    TableName tableName) throws IOException {
    if (deleteFailedOnce || tableName.isSystemTable()) {
      return;
    }
    // Record the failure before throwing so the next attempt goes through.
    deleteFailedOnce = true;
    throw new IOException("execute postCompletedDeleteTableAction failed once.");
  }

  /**
   * Exposes this instance as the master observer of the coprocessor.
   */
  @Override
  public Optional<MasterObserver> getMasterObserver() {
    return Optional.of(this);
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase.master.procedure;

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseClassTestRule;
import org.apache.hadoop.hbase.HBaseTestingUtility;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.RegionInfo;
import org.apache.hadoop.hbase.client.TableDescriptor;
import org.apache.hadoop.hbase.coprocessor.BadMasterObserverForCreateDeleteTable;
import org.apache.hadoop.hbase.coprocessor.CoprocessorHost;
import org.apache.hadoop.hbase.procedure2.ProcedureExecutor;
import org.apache.hadoop.hbase.procedure2.ProcedureTestingUtility;
import org.apache.hadoop.hbase.testclassification.MasterTests;
import org.apache.hadoop.hbase.testclassification.MediumTests;
import org.apache.hadoop.hbase.util.ModifyRegionUtils;
import org.junit.AfterClass;
import org.junit.Assert;
import org.junit.BeforeClass;
import org.junit.ClassRule;
import org.junit.Test;
import org.junit.experimental.categories.Category;

/**
 * Runs a create-table and a delete-table procedure on a mini cluster whose master has
 * {@link BadMasterObserverForCreateDeleteTable} installed, and checks that both procedures
 * still leave the table in the expected state.
 */
@Category({ MasterTests.class, MediumTests.class })
public class TestCreateDeleteTableProcedureWithRetry {
  @ClassRule
  public static final HBaseClassTestRule CLASS_RULE =
    HBaseClassTestRule.forClass(TestCreateDeleteTableProcedureWithRetry.class);

  private static final HBaseTestingUtility UTIL = new HBaseTestingUtility();

  private static final TableName TABLE_NAME =
    TableName.valueOf(TestCreateDeleteTableProcedureWithRetry.class.getSimpleName());

  private static final String CF = "cf";

  @BeforeClass
  public static void setUp() throws Exception {
    // Install the failing observer on the master before the cluster starts.
    Configuration masterConf = UTIL.getConfiguration();
    String observerClass = BadMasterObserverForCreateDeleteTable.class.getName();
    masterConf.set(CoprocessorHost.MASTER_COPROCESSOR_CONF_KEY, observerClass);
    UTIL.startMiniCluster(1);
  }

  @AfterClass
  public static void tearDown() throws Exception {
    UTIL.shutdownMiniCluster();
  }

  @Test
  public void testCreateDeleteTableRetry() throws IOException {
    ProcedureExecutor<MasterProcedureEnv> executor =
      UTIL.getMiniHBaseCluster().getMaster().getMasterProcedureExecutor();
    createTableAndVerify(executor);
    disableAndDeleteTableAndVerify(executor);
  }

  /** Submits a CreateTableProcedure, waits for it and validates the created table. */
  private static void createTableAndVerify(ProcedureExecutor<MasterProcedureEnv> executor)
    throws IOException {
    TableDescriptor descriptor = MasterProcedureTestingUtility.createHTD(TABLE_NAME, CF);
    RegionInfo[] regionInfos = ModifyRegionUtils.createRegionInfos(descriptor, null);
    CreateTableProcedure create =
      new CreateTableProcedure(executor.getEnvironment(), descriptor, regionInfos);
    ProcedureTestingUtility.submitAndWait(executor, create);

    Assert.assertTrue(UTIL.getAdmin().tableExists(TABLE_NAME));
    MasterProcedureTestingUtility.validateTableCreation(UTIL.getMiniHBaseCluster().getMaster(),
      TABLE_NAME, regionInfos, CF);
  }

  /** Disables the table, submits a DeleteTableProcedure, waits and validates the deletion. */
  private static void disableAndDeleteTableAndVerify(
    ProcedureExecutor<MasterProcedureEnv> executor) throws IOException {
    UTIL.getAdmin().disableTable(TABLE_NAME);
    DeleteTableProcedure delete =
      new DeleteTableProcedure(executor.getEnvironment(), TABLE_NAME);
    ProcedureTestingUtility.submitAndWait(executor, delete);

    Assert.assertFalse(UTIL.getAdmin().tableExists(TABLE_NAME));
    MasterProcedureTestingUtility.validateTableDeletion(UTIL.getMiniHBaseCluster().getMaster(),
      TABLE_NAME);
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
import org.apache.hadoop.hbase.master.MasterFileSystem;
import org.apache.hadoop.hbase.procedure2.Procedure;
import org.apache.hadoop.hbase.procedure2.ProcedureExecutor;
import org.apache.hadoop.hbase.procedure2.ProcedureSuspendedException;
import org.apache.hadoop.hbase.procedure2.ProcedureTestingUtility;
import org.apache.hadoop.hbase.regionserver.storefiletracker.StoreFileTrackerFactory;
import org.apache.hadoop.hbase.regionserver.storefiletracker.StoreFileTrackerForTest;
Expand Down Expand Up @@ -244,7 +245,8 @@ public CreateTableProcedureOnHDFSFailure(final MasterProcedureEnv env,

@Override
protected Flow executeFromState(MasterProcedureEnv env,
MasterProcedureProtos.CreateTableState state) throws InterruptedException {
MasterProcedureProtos.CreateTableState state)
throws InterruptedException, ProcedureSuspendedException {

if (
!failOnce && state == MasterProcedureProtos.CreateTableState.CREATE_TABLE_WRITE_FS_LAYOUT
Expand Down

0 comments on commit 1c05099

Please sign in to comment.