From df57e51aefc9a5c283c320aeabbb2e924f81670e Mon Sep 17 00:00:00 2001 From: chaijunjie0101 <64140218+chaijunjie0101@users.noreply.github.com> Date: Sun, 21 Apr 2024 19:39:32 +0800 Subject: [PATCH] HBASE-28150 CreateTableProcedure and DeleteTableProcedure should sleep a while before retrying (#5502) Signed-off-by: Duo Zhang --- .../procedure/CreateTableProcedure.java | 24 ++++- .../procedure/DeleteTableProcedure.java | 24 ++++- ...BadMasterObserverForCreateDeleteTable.java | 55 ++++++++++++ ...stCreateDeleteTableProcedureWithRetry.java | 88 +++++++++++++++++++ .../procedure/TestCreateTableProcedure.java | 4 +- 5 files changed, 190 insertions(+), 5 deletions(-) create mode 100644 hbase-server/src/test/java/org/apache/hadoop/hbase/coprocessor/BadMasterObserverForCreateDeleteTable.java create mode 100644 hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestCreateDeleteTableProcedureWithRetry.java diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/CreateTableProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/CreateTableProcedure.java index 17998fec7bd7..23ad3b42aef0 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/CreateTableProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/CreateTableProcedure.java @@ -35,12 +35,15 @@ import org.apache.hadoop.hbase.master.MasterCoprocessorHost; import org.apache.hadoop.hbase.master.MasterFileSystem; import org.apache.hadoop.hbase.procedure2.ProcedureStateSerializer; +import org.apache.hadoop.hbase.procedure2.ProcedureSuspendedException; +import org.apache.hadoop.hbase.procedure2.ProcedureUtil; import org.apache.hadoop.hbase.regionserver.storefiletracker.StoreFileTrackerFactory; import org.apache.hadoop.hbase.regionserver.storefiletracker.StoreFileTrackerValidationUtils; import org.apache.hadoop.hbase.rsgroup.RSGroupInfo; import org.apache.hadoop.hbase.util.CommonFSUtils; import org.apache.hadoop.hbase.util.FSTableDescriptors; import org.apache.hadoop.hbase.util.ModifyRegionUtils; +import org.apache.hadoop.hbase.util.RetryCounter; import org.apache.yetus.audience.InterfaceAudience; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -51,6 +54,7 @@ import org.apache.hadoop.hbase.shaded.protobuf.generated.HBaseProtos; import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos; import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.CreateTableState; +import org.apache.hadoop.hbase.shaded.protobuf.generated.ProcedureProtos; @InterfaceAudience.Private public class CreateTableProcedure extends AbstractStateMachineTableProcedure { @@ -60,6 +64,7 @@ public class CreateTableProcedure extends AbstractStateMachineTableProcedure newRegions; + private RetryCounter retryCounter; public CreateTableProcedure() { // Required by the Procedure framework to create the procedure on replay @@ -80,7 +85,7 @@ public CreateTableProcedure(final MasterProcedureEnv env, final TableDescriptor @Override protected Flow executeFromState(final MasterProcedureEnv env, final CreateTableState state) - throws InterruptedException { + throws InterruptedException, ProcedureSuspendedException { LOG.info("{} execute state={}", this, state); try { switch (state) { @@ -131,6 +136,7 @@ protected Flow executeFromState(final MasterProcedureEnv env, final CreateTableS break; case CREATE_TABLE_POST_OPERATION: postCreate(env); + retryCounter = null; return Flow.NO_MORE_STATE; default: throw new UnsupportedOperationException("unhandled state=" + state); @@ -139,12 +145,26 @@ protected Flow executeFromState(final MasterProcedureEnv env, final CreateTableS if (isRollbackSupported(state)) { setFailure("master-create-table", e); } else { - LOG.warn("Retriable error trying to create table=" + getTableName() + " state=" + state, e); + if (retryCounter == null) { + retryCounter = ProcedureUtil.createRetryCounter(env.getMasterConfiguration()); + } + long backoff = retryCounter.getBackoffTimeAndIncrementAttempts(); + LOG.warn("Retriable error trying to create table={},state={},suspend {}secs.", + getTableName(), state, backoff / 1000, e); + throw suspend(Math.toIntExact(backoff), true); } } + retryCounter = null; return Flow.HAS_MORE_STATE; } + @Override + protected synchronized boolean setTimeoutFailure(MasterProcedureEnv env) { + setState(ProcedureProtos.ProcedureState.RUNNABLE); + env.getProcedureScheduler().addFront(this); + return false; + } + @Override protected void rollbackState(final MasterProcedureEnv env, final CreateTableState state) throws IOException { diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/DeleteTableProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/DeleteTableProcedure.java index 80fb5d0534d4..8c2f1067c952 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/DeleteTableProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/DeleteTableProcedure.java @@ -41,9 +41,12 @@ import org.apache.hadoop.hbase.mob.MobConstants; import org.apache.hadoop.hbase.mob.MobUtils; import org.apache.hadoop.hbase.procedure2.ProcedureStateSerializer; +import org.apache.hadoop.hbase.procedure2.ProcedureSuspendedException; +import org.apache.hadoop.hbase.procedure2.ProcedureUtil; import org.apache.hadoop.hbase.util.CommonFSUtils; import org.apache.hadoop.hbase.util.EnvironmentEdgeManager; import org.apache.hadoop.hbase.util.FSUtils; +import org.apache.hadoop.hbase.util.RetryCounter; import org.apache.yetus.audience.InterfaceAudience; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -54,6 +57,7 @@ import org.apache.hadoop.hbase.shaded.protobuf.generated.HBaseProtos; import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos; import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.DeleteTableState; +import org.apache.hadoop.hbase.shaded.protobuf.generated.ProcedureProtos; @InterfaceAudience.Private public class DeleteTableProcedure extends AbstractStateMachineTableProcedure { @@ -61,6 +65,7 @@ public class DeleteTableProcedure extends AbstractStateMachineTableProcedure regions; private TableName tableName; + private RetryCounter retryCounter; public DeleteTableProcedure() { // Required by the Procedure framework to create the procedure on replay @@ -79,7 +84,7 @@ public DeleteTableProcedure(final MasterProcedureEnv env, final TableName tableN @Override protected Flow executeFromState(final MasterProcedureEnv env, DeleteTableState state) - throws InterruptedException { + throws InterruptedException, ProcedureSuspendedException { if (LOG.isTraceEnabled()) { LOG.trace(this + " execute state=" + state); } @@ -124,6 +129,7 @@ protected Flow executeFromState(final MasterProcedureEnv env, DeleteTableState s break; case DELETE_TABLE_POST_OPERATION: postDelete(env); + retryCounter = null; LOG.debug("Finished {}", this); return Flow.NO_MORE_STATE; default: @@ -133,12 +139,26 @@ protected Flow executeFromState(final MasterProcedureEnv env, DeleteTableState s if (isRollbackSupported(state)) { setFailure("master-delete-table", e); } else { - LOG.warn("Retriable error trying to delete table=" + getTableName() + " state=" + state, e); + if (retryCounter == null) { + retryCounter = ProcedureUtil.createRetryCounter(env.getMasterConfiguration()); + } + long backoff = retryCounter.getBackoffTimeAndIncrementAttempts(); + LOG.warn("Retriable error trying to delete table={},state={},suspend {}secs.", + getTableName(), state, backoff / 1000, e); + throw suspend(Math.toIntExact(backoff), true); } } + retryCounter = null; return Flow.HAS_MORE_STATE; } + @Override + protected synchronized boolean setTimeoutFailure(MasterProcedureEnv env) { + setState(ProcedureProtos.ProcedureState.RUNNABLE); + env.getProcedureScheduler().addFront(this); + return false; + } + @Override protected boolean abort(MasterProcedureEnv env) { // TODO: Current behavior is: with no rollback and no abort support, procedure may get stuck diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/coprocessor/BadMasterObserverForCreateDeleteTable.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/coprocessor/BadMasterObserverForCreateDeleteTable.java new file mode 100644 index 000000000000..454a24e198aa --- /dev/null +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/coprocessor/BadMasterObserverForCreateDeleteTable.java @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hbase.coprocessor; + +import java.io.IOException; +import java.util.Optional; +import org.apache.hadoop.hbase.TableName; +import org.apache.hadoop.hbase.client.RegionInfo; +import org.apache.hadoop.hbase.client.TableDescriptor; + +/** + * A bad Master Observer to prevent user to create/delete table once. + */ +public class BadMasterObserverForCreateDeleteTable implements MasterObserver, MasterCoprocessor { + private boolean createFailedOnce = false; + private boolean deleteFailedOnce = false; + + @Override + public void postCompletedCreateTableAction(ObserverContext ctx, + TableDescriptor desc, RegionInfo[] regions) throws IOException { + if (!createFailedOnce && !desc.getTableName().isSystemTable()) { + createFailedOnce = true; + throw new IOException("execute postCompletedCreateTableAction failed once."); + } + } + + @Override + public void postCompletedDeleteTableAction(ObserverContext ctx, + TableName tableName) throws IOException { + if (!deleteFailedOnce && !tableName.isSystemTable()) { + deleteFailedOnce = true; + throw new IOException("execute postCompletedDeleteTableAction failed once."); + } + } + + @Override + public Optional getMasterObserver() { + return Optional.of(this); + } +} diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestCreateDeleteTableProcedureWithRetry.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestCreateDeleteTableProcedureWithRetry.java new file mode 100644 index 000000000000..3491aa639844 --- /dev/null +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestCreateDeleteTableProcedureWithRetry.java @@ -0,0 +1,88 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hbase.master.procedure; + +import java.io.IOException; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hbase.HBaseClassTestRule; +import org.apache.hadoop.hbase.HBaseTestingUtil; +import org.apache.hadoop.hbase.TableName; +import org.apache.hadoop.hbase.client.RegionInfo; +import org.apache.hadoop.hbase.client.TableDescriptor; +import org.apache.hadoop.hbase.coprocessor.BadMasterObserverForCreateDeleteTable; +import org.apache.hadoop.hbase.coprocessor.CoprocessorHost; +import org.apache.hadoop.hbase.procedure2.ProcedureExecutor; +import org.apache.hadoop.hbase.procedure2.ProcedureTestingUtility; +import org.apache.hadoop.hbase.testclassification.MasterTests; +import org.apache.hadoop.hbase.testclassification.MediumTests; +import org.apache.hadoop.hbase.util.ModifyRegionUtils; +import org.junit.AfterClass; +import org.junit.Assert; +import org.junit.BeforeClass; +import org.junit.ClassRule; +import org.junit.Test; +import org.junit.experimental.categories.Category; + +@Category({ MasterTests.class, MediumTests.class }) +public class TestCreateDeleteTableProcedureWithRetry { + @ClassRule + public static final HBaseClassTestRule CLASS_RULE = + HBaseClassTestRule.forClass(TestCreateDeleteTableProcedureWithRetry.class); + + private static final HBaseTestingUtil UTIL = new HBaseTestingUtil(); + + private static final TableName TABLE_NAME = + TableName.valueOf(TestCreateDeleteTableProcedureWithRetry.class.getSimpleName()); + + private static final String CF = "cf"; + + @BeforeClass + public static void setUp() throws Exception { + Configuration conf = UTIL.getConfiguration(); + conf.set(CoprocessorHost.MASTER_COPROCESSOR_CONF_KEY, + BadMasterObserverForCreateDeleteTable.class.getName()); + UTIL.startMiniCluster(1); + } + + @AfterClass + public static void tearDown() throws Exception { + UTIL.shutdownMiniCluster(); + } + + @Test + public void testCreateDeleteTableRetry() throws IOException { + ProcedureExecutor procExec = + UTIL.getMiniHBaseCluster().getMaster().getMasterProcedureExecutor(); + TableDescriptor htd = MasterProcedureTestingUtility.createHTD(TABLE_NAME, CF); + RegionInfo[] regions = ModifyRegionUtils.createRegionInfos(htd, null); + CreateTableProcedure createProc = + new CreateTableProcedure(procExec.getEnvironment(), htd, regions); + ProcedureTestingUtility.submitAndWait(procExec, createProc); + Assert.assertTrue(UTIL.getAdmin().tableExists(TABLE_NAME)); + MasterProcedureTestingUtility.validateTableCreation(UTIL.getMiniHBaseCluster().getMaster(), + TABLE_NAME, regions, CF); + + UTIL.getAdmin().disableTable(TABLE_NAME); + DeleteTableProcedure deleteProc = + new DeleteTableProcedure(procExec.getEnvironment(), TABLE_NAME); + ProcedureTestingUtility.submitAndWait(procExec, deleteProc); + Assert.assertFalse(UTIL.getAdmin().tableExists(TABLE_NAME)); + MasterProcedureTestingUtility.validateTableDeletion(UTIL.getMiniHBaseCluster().getMaster(), + TABLE_NAME); + } +} diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestCreateTableProcedure.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestCreateTableProcedure.java index bed41f4da86c..0bb54cc190e3 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestCreateTableProcedure.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestCreateTableProcedure.java @@ -39,6 +39,7 @@ import org.apache.hadoop.hbase.master.MasterFileSystem; import org.apache.hadoop.hbase.procedure2.Procedure; import org.apache.hadoop.hbase.procedure2.ProcedureExecutor; +import org.apache.hadoop.hbase.procedure2.ProcedureSuspendedException; import org.apache.hadoop.hbase.procedure2.ProcedureTestingUtility; import org.apache.hadoop.hbase.regionserver.storefiletracker.StoreFileTrackerFactory; import org.apache.hadoop.hbase.regionserver.storefiletracker.StoreFileTrackerForTest; @@ -244,7 +245,8 @@ public CreateTableProcedureOnHDFSFailure(final MasterProcedureEnv env, @Override protected Flow executeFromState(MasterProcedureEnv env, - MasterProcedureProtos.CreateTableState state) throws InterruptedException { + MasterProcedureProtos.CreateTableState state) + throws InterruptedException, ProcedureSuspendedException { if ( !failOnce && state == MasterProcedureProtos.CreateTableState.CREATE_TABLE_WRITE_FS_LAYOUT