From 2f30fec71de3a872580ee34bb62f17d7c60ee61b Mon Sep 17 00:00:00 2001 From: Ray Mattingly Date: Thu, 9 Nov 2023 15:38:45 -0500 Subject: [PATCH 1/9] rate limit reopen proc --- .../procedure/ModifyTableProcedure.java | 13 +- .../ReopenTableRegionsProcedure.java | 68 ++++++- ...openTableRegionsProcedureBatchBackoff.java | 103 +++++++++++ ...stReopenTableRegionsProcedureBatching.java | 169 ++++++++++++++++++ 4 files changed, 343 insertions(+), 10 deletions(-) create mode 100644 hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestReopenTableRegionsProcedureBatchBackoff.java create mode 100644 hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestReopenTableRegionsProcedureBatching.java diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ModifyTableProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ModifyTableProcedure.java index d7377adf3a10..9a1a857b4c9d 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ModifyTableProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ModifyTableProcedure.java @@ -17,6 +17,11 @@ */ package org.apache.hadoop.hbase.master.procedure; +import static org.apache.hadoop.hbase.master.procedure.ReopenTableRegionsProcedure.REOPEN_BATCH_BACKOFF_MILLIS_DEFAULT; +import static org.apache.hadoop.hbase.master.procedure.ReopenTableRegionsProcedure.REOPEN_BATCH_BACKOFF_MILLIS_KEY; +import static org.apache.hadoop.hbase.master.procedure.ReopenTableRegionsProcedure.REOPEN_BATCH_SIZE_DEFAULT; +import static org.apache.hadoop.hbase.master.procedure.ReopenTableRegionsProcedure.REOPEN_BATCH_SIZE_KEY; + import java.io.IOException; import java.util.Arrays; import java.util.Collections; @@ -25,6 +30,7 @@ import java.util.function.Supplier; import java.util.stream.Collectors; import java.util.stream.IntStream; +import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hbase.ConcurrentTableModificationException; import org.apache.hadoop.hbase.DoNotRetryIOException; import org.apache.hadoop.hbase.HBaseIOException; @@ -147,7 +153,12 @@ protected Flow executeFromState(final MasterProcedureEnv env, final ModifyTableS break; case MODIFY_TABLE_REOPEN_ALL_REGIONS: if (isTableEnabled(env)) { - addChildProcedure(new ReopenTableRegionsProcedure(getTableName())); + Configuration conf = env.getMasterConfiguration(); + long backoffMillis = + conf.getLong(REOPEN_BATCH_BACKOFF_MILLIS_KEY, REOPEN_BATCH_BACKOFF_MILLIS_DEFAULT); + int batchSize = conf.getInt(REOPEN_BATCH_SIZE_KEY, REOPEN_BATCH_SIZE_DEFAULT); + addChildProcedure( + new ReopenTableRegionsProcedure(getTableName(), backoffMillis, batchSize)); } setNextState(ModifyTableState.MODIFY_TABLE_ASSIGN_NEW_REPLICAS); break; diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ReopenTableRegionsProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ReopenTableRegionsProcedure.java index 4efb1768b0ce..1e9ca81641f4 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ReopenTableRegionsProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ReopenTableRegionsProcedure.java @@ -17,10 +17,12 @@ */ package org.apache.hadoop.hbase.master.procedure; +import com.google.errorprone.annotations.RestrictedApi; import java.io.IOException; import java.util.ArrayList; import java.util.Collections; import java.util.List; +import java.util.Set; import java.util.stream.Collectors; import org.apache.hadoop.hbase.HRegionLocation; import org.apache.hadoop.hbase.TableName; @@ -53,6 +55,12 @@ public class ReopenTableRegionsProcedure private static final Logger LOG = LoggerFactory.getLogger(ReopenTableRegionsProcedure.class); + public static final String REOPEN_BATCH_BACKOFF_MILLIS_KEY = + "hbase.table.regions.reopen.batch.backoff.ms"; + public static final long REOPEN_BATCH_BACKOFF_MILLIS_DEFAULT = 0L; + public static final String REOPEN_BATCH_SIZE_KEY = "hbase.table.regions.reopen.batch.size"; + public static final int REOPEN_BATCH_SIZE_DEFAULT = Integer.MAX_VALUE; + private TableName tableName; // Specify specific regions of a table to reopen. @@ -61,20 +69,36 @@ public class ReopenTableRegionsProcedure private List regions = Collections.emptyList(); + private List currentRegionBatch = Collections.emptyList(); + private RetryCounter retryCounter; + private final long reopenBatchBackoffMillis; + private final int reopenBatchSize; + public ReopenTableRegionsProcedure() { - regionNames = Collections.emptyList(); + this(null); } public ReopenTableRegionsProcedure(TableName tableName) { - this.tableName = tableName; - this.regionNames = Collections.emptyList(); + this(tableName, Collections.emptyList()); } public ReopenTableRegionsProcedure(final TableName tableName, final List regionNames) { + this(tableName, regionNames, REOPEN_BATCH_BACKOFF_MILLIS_DEFAULT, REOPEN_BATCH_SIZE_DEFAULT); + } + + public ReopenTableRegionsProcedure(final TableName tableName, long reopenBatchBackoffMillis, + int reopenBatchSize) { + this(tableName, Collections.emptyList(), reopenBatchBackoffMillis, reopenBatchSize); + } + + public ReopenTableRegionsProcedure(final TableName tableName, final List regionNames, + long reopenBatchBackoffMillis, int reopenBatchSize) { this.tableName = tableName; this.regionNames = regionNames; + this.reopenBatchBackoffMillis = reopenBatchBackoffMillis; + this.reopenBatchSize = reopenBatchSize; } @Override @@ -87,6 +111,12 @@ public TableOperationType getTableOperationType() { return TableOperationType.REGION_EDIT; } + @RestrictedApi(explanation = "Should only be called in tests", link = "", + allowedOnPath = ".*/src/test/.*") + public List getCurrentRegionBatch() { + return new ArrayList<>(currentRegionBatch); + } + private boolean canSchedule(MasterProcedureEnv env, HRegionLocation loc) { if (loc.getSeqNum() < 0) { return false; @@ -114,7 +144,8 @@ protected Flow executeFromState(MasterProcedureEnv env, ReopenTableRegionsState setNextState(ReopenTableRegionsState.REOPEN_TABLE_REGIONS_REOPEN_REGIONS); return Flow.HAS_MORE_STATE; case REOPEN_TABLE_REGIONS_REOPEN_REGIONS: - for (HRegionLocation loc : regions) { + currentRegionBatch = regions.stream().limit(reopenBatchSize).collect(Collectors.toList()); + for (HRegionLocation loc : currentRegionBatch) { RegionStateNode regionNode = env.getAssignmentManager().getRegionStates().getRegionStateNode(loc.getRegion()); // this possible, maybe the region has already been merged or split, see HBASE-20921 @@ -139,11 +170,30 @@ protected Flow executeFromState(MasterProcedureEnv env, ReopenTableRegionsState case REOPEN_TABLE_REGIONS_CONFIRM_REOPENED: regions = regions.stream().map(env.getAssignmentManager().getRegionStates()::checkReopened) .filter(l -> l != null).collect(Collectors.toList()); - if (regions.isEmpty()) { - return Flow.NO_MORE_STATE; + // we need to create a set of region names because the HRegionLocation hashcode is only + // based + // on the server name + Set currentRegionBatchNames = currentRegionBatch.stream() + .map(r -> r.getRegion().getRegionName()).collect(Collectors.toSet()); + currentRegionBatch = regions.stream() + .filter(r -> currentRegionBatchNames.contains(r.getRegion().getRegionName())) + .collect(Collectors.toList()); + if (currentRegionBatch.isEmpty()) { + if (regions.isEmpty()) { + return Flow.NO_MORE_STATE; + } else { + if (reopenBatchBackoffMillis > 0) { + Thread.sleep(reopenBatchBackoffMillis); + } + setNextState(ReopenTableRegionsState.REOPEN_TABLE_REGIONS_REOPEN_REGIONS); + return Flow.HAS_MORE_STATE; + } } - if (regions.stream().anyMatch(loc -> canSchedule(env, loc))) { + if (currentRegionBatch.stream().anyMatch(loc -> canSchedule(env, loc))) { retryCounter = null; + if (reopenBatchBackoffMillis > 0) { + Thread.sleep(reopenBatchBackoffMillis); + } setNextState(ReopenTableRegionsState.REOPEN_TABLE_REGIONS_REOPEN_REGIONS); return Flow.HAS_MORE_STATE; } @@ -154,9 +204,9 @@ protected Flow executeFromState(MasterProcedureEnv env, ReopenTableRegionsState } long backoff = retryCounter.getBackoffTimeAndIncrementAttempts(); LOG.info( - "There are still {} region(s) which need to be reopened for table {} are in " + "There are still {} region(s) which need to be reopened for table {}. {} are in " + "OPENING state, suspend {}secs and try again later", - regions.size(), tableName, backoff / 1000); + regions.size(), tableName, currentRegionBatch.size(), backoff / 1000); setTimeout(Math.toIntExact(backoff)); setState(ProcedureProtos.ProcedureState.WAITING_TIMEOUT); skipPersistence(); diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestReopenTableRegionsProcedureBatchBackoff.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestReopenTableRegionsProcedureBatchBackoff.java new file mode 100644 index 000000000000..fbabb1fa22cc --- /dev/null +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestReopenTableRegionsProcedureBatchBackoff.java @@ -0,0 +1,103 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hbase.master.procedure; + +import static org.junit.Assert.assertTrue; + +import java.io.IOException; +import java.time.Duration; +import java.time.Instant; +import java.util.List; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hbase.HBaseClassTestRule; +import org.apache.hadoop.hbase.HBaseTestingUtil; +import org.apache.hadoop.hbase.TableName; +import org.apache.hadoop.hbase.client.RegionInfo; +import org.apache.hadoop.hbase.master.ServerManager; +import org.apache.hadoop.hbase.procedure2.ProcedureExecutor; +import org.apache.hadoop.hbase.testclassification.MasterTests; +import org.apache.hadoop.hbase.testclassification.MediumTests; +import org.apache.hadoop.hbase.util.Bytes; +import org.junit.AfterClass; +import org.junit.BeforeClass; +import org.junit.ClassRule; +import org.junit.Test; +import org.junit.experimental.categories.Category; + +/** + * Confirm that we will rate limit reopen batches when reopening all table regions. This can avoid + * the pain associated with reopening too many regions at once. + */ +@Category({ MasterTests.class, MediumTests.class }) +public class TestReopenTableRegionsProcedureBatchBackoff { + + @ClassRule + public static final HBaseClassTestRule CLASS_RULE = + HBaseClassTestRule.forClass(TestReopenTableRegionsProcedureBatchBackoff.class); + + private static final HBaseTestingUtil UTIL = new HBaseTestingUtil(); + + private static TableName TABLE_NAME = TableName.valueOf("BatchBackoff"); + private static final int BACKOFF_MILLIS_PER_RS = 3_000; + private static final int REOPEN_BATCH_SIZE = 1; + + private static byte[] CF = Bytes.toBytes("cf"); + + @BeforeClass + public static void setUp() throws Exception { + Configuration conf = UTIL.getConfiguration(); + conf.setInt(ServerManager.WAIT_ON_REGIONSERVERS_MINTOSTART, 1); + UTIL.startMiniCluster(1); + UTIL.createMultiRegionTable(TABLE_NAME, CF, 10); + } + + @AfterClass + public static void tearDown() throws Exception { + UTIL.shutdownMiniCluster(); + } + + @Test + public void testRegionBatchBackoff() throws IOException { + ProcedureExecutor procExec = + UTIL.getMiniHBaseCluster().getMaster().getMasterProcedureExecutor(); + List regions = UTIL.getAdmin().getRegions(TABLE_NAME); + assertTrue(10 <= regions.size()); + ReopenTableRegionsProcedure proc = + new ReopenTableRegionsProcedure(TABLE_NAME, BACKOFF_MILLIS_PER_RS, REOPEN_BATCH_SIZE); + procExec.submitProcedure(proc); + Instant startedAt = Instant.now(); + ProcedureSyncWait.waitForProcedureToComplete(procExec, proc, 60_000); + Instant stoppedAt = Instant.now(); + assertTrue(Duration.between(startedAt, stoppedAt).toMillis() + > (long) regions.size() * BACKOFF_MILLIS_PER_RS); + } + + @Test + public void testRegionBatchNoBackoff() throws IOException { + ProcedureExecutor procExec = + UTIL.getMiniHBaseCluster().getMaster().getMasterProcedureExecutor(); + List regions = UTIL.getAdmin().getRegions(TABLE_NAME); + assertTrue(10 <= regions.size()); + int noBackoffMillis = 0; + ReopenTableRegionsProcedure proc = + new ReopenTableRegionsProcedure(TABLE_NAME, noBackoffMillis, REOPEN_BATCH_SIZE); + procExec.submitProcedure(proc); + ProcedureSyncWait.waitForProcedureToComplete(procExec, proc, + (long) regions.size() * BACKOFF_MILLIS_PER_RS); + } +} diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestReopenTableRegionsProcedureBatching.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestReopenTableRegionsProcedureBatching.java new file mode 100644 index 000000000000..583b09e9ae48 --- /dev/null +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestReopenTableRegionsProcedureBatching.java @@ -0,0 +1,169 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hbase.master.procedure; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +import java.io.IOException; +import java.util.List; +import java.util.Set; +import java.util.stream.Collectors; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hbase.HBaseClassTestRule; +import org.apache.hadoop.hbase.HBaseTestingUtil; +import org.apache.hadoop.hbase.HRegionLocation; +import org.apache.hadoop.hbase.TableName; +import org.apache.hadoop.hbase.client.RegionInfo; +import org.apache.hadoop.hbase.master.RegionState.State; +import org.apache.hadoop.hbase.master.ServerManager; +import org.apache.hadoop.hbase.master.assignment.AssignmentManager; +import org.apache.hadoop.hbase.master.assignment.RegionStateNode; +import org.apache.hadoop.hbase.master.assignment.TransitRegionStateProcedure; +import org.apache.hadoop.hbase.procedure2.ProcedureExecutor; +import org.apache.hadoop.hbase.testclassification.MasterTests; +import org.apache.hadoop.hbase.testclassification.MediumTests; +import org.apache.hadoop.hbase.util.Bytes; +import org.junit.AfterClass; +import org.junit.BeforeClass; +import org.junit.ClassRule; +import org.junit.Test; +import org.junit.experimental.categories.Category; + +import org.apache.hadoop.hbase.shaded.protobuf.generated.ProcedureProtos.ProcedureState; + +/** + * Confirm that we will batch region reopens when reopening all table regions. This can avoid the + * pain associated with reopening too many regions at once. + */ +@Category({ MasterTests.class, MediumTests.class }) +public class TestReopenTableRegionsProcedureBatching { + + @ClassRule + public static final HBaseClassTestRule CLASS_RULE = + HBaseClassTestRule.forClass(TestReopenTableRegionsProcedureBatching.class); + + private static final HBaseTestingUtil UTIL = new HBaseTestingUtil(); + private static final int BACKOFF_MILLIS_PER_RS = 0; + private static final int REOPEN_BATCH_SIZE = 1; + + private static TableName TABLE_NAME = TableName.valueOf("Batching"); + + private static byte[] CF = Bytes.toBytes("cf"); + + @BeforeClass + public static void setUp() throws Exception { + Configuration conf = UTIL.getConfiguration(); + conf.setInt(ServerManager.WAIT_ON_REGIONSERVERS_MINTOSTART, 1); + UTIL.startMiniCluster(1); + UTIL.createMultiRegionTable(TABLE_NAME, CF); + } + + @AfterClass + public static void tearDown() throws Exception { + UTIL.shutdownMiniCluster(); + } + + @Test + public void testRegionBatching() throws IOException { + AssignmentManager am = UTIL.getMiniHBaseCluster().getMaster().getAssignmentManager(); + ProcedureExecutor procExec = + UTIL.getMiniHBaseCluster().getMaster().getMasterProcedureExecutor(); + List regions = UTIL.getAdmin().getRegions(TABLE_NAME); + assertTrue(2 <= regions.size()); + Set stuckRegions = + regions.stream().map(r -> stickRegion(am, procExec, r)).collect(Collectors.toSet()); + ReopenTableRegionsProcedure proc = + new ReopenTableRegionsProcedure(TABLE_NAME, BACKOFF_MILLIS_PER_RS, REOPEN_BATCH_SIZE); + procExec.submitProcedure(proc); + UTIL.waitFor(10000, () -> proc.getState() == ProcedureState.WAITING_TIMEOUT); + confirmBatchSize(REOPEN_BATCH_SIZE, stuckRegions, proc); + ProcedureSyncWait.waitForProcedureToComplete(procExec, proc, 60_000); + } + + @Test + public void testNoRegionBatching() throws IOException { + AssignmentManager am = UTIL.getMiniHBaseCluster().getMaster().getAssignmentManager(); + ProcedureExecutor procExec = + UTIL.getMiniHBaseCluster().getMaster().getMasterProcedureExecutor(); + List regions = UTIL.getAdmin().getRegions(TABLE_NAME); + assertTrue(2 <= regions.size()); + Set stuckRegions = + regions.stream().map(r -> stickRegion(am, procExec, r)).collect(Collectors.toSet()); + ReopenTableRegionsProcedure proc = new ReopenTableRegionsProcedure(TABLE_NAME); + procExec.submitProcedure(proc); + UTIL.waitFor(10000, () -> proc.getState() == ProcedureState.WAITING_TIMEOUT); + confirmBatchSize(regions.size(), stuckRegions, proc); + ProcedureSyncWait.waitForProcedureToComplete(procExec, proc, 60_000); + } + + private void confirmBatchSize(int expectedBatchSize, Set stuckRegions, + ReopenTableRegionsProcedure proc) { + while (true) { + List currentRegionBatch = proc.getCurrentRegionBatch(); + if (currentRegionBatch.isEmpty()) { + continue; + } + stuckRegions.forEach(this::unstickRegion); + assertEquals(expectedBatchSize, currentRegionBatch.size()); + break; + } + } + + static class StuckRegion { + final TransitRegionStateProcedure trsp; + final RegionStateNode regionNode; + final long openSeqNum; + + public StuckRegion(TransitRegionStateProcedure trsp, RegionStateNode regionNode, + long openSeqNum) { + this.trsp = trsp; + this.regionNode = regionNode; + this.openSeqNum = openSeqNum; + } + } + + private StuckRegion stickRegion(AssignmentManager am, + ProcedureExecutor procExec, RegionInfo regionInfo) { + RegionStateNode regionNode = am.getRegionStates().getRegionStateNode(regionInfo); + TransitRegionStateProcedure trsp = + TransitRegionStateProcedure.unassign(procExec.getEnvironment(), regionInfo); + regionNode.lock(); + long openSeqNum; + try { + openSeqNum = regionNode.getOpenSeqNum(); + regionNode.setState(State.OPENING); + regionNode.setOpenSeqNum(-1L); + regionNode.setProcedure(trsp); + } finally { + regionNode.unlock(); + } + return new StuckRegion(trsp, regionNode, openSeqNum); + } + + private void unstickRegion(StuckRegion stuckRegion) { + stuckRegion.regionNode.lock(); + try { + stuckRegion.regionNode.setState(State.OPEN); + stuckRegion.regionNode.setOpenSeqNum(stuckRegion.openSeqNum); + stuckRegion.regionNode.unsetProcedure(stuckRegion.trsp); + } finally { + stuckRegion.regionNode.unlock(); + } + } +} From a3a880439fa553f0b96973bb051884c4a4998ea7 Mon Sep 17 00:00:00 2001 From: Ray Mattingly Date: Wed, 22 Nov 2023 11:40:09 -0500 Subject: [PATCH 2/9] use proc timeout rather than sleep --- .../ReopenTableRegionsProcedure.java | 23 +++++++++++-------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ReopenTableRegionsProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ReopenTableRegionsProcedure.java index 1e9ca81641f4..0b9b3c4f8d3d 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ReopenTableRegionsProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ReopenTableRegionsProcedure.java @@ -182,19 +182,19 @@ protected Flow executeFromState(MasterProcedureEnv env, ReopenTableRegionsState if (regions.isEmpty()) { return Flow.NO_MORE_STATE; } else { + setNextState(ReopenTableRegionsState.REOPEN_TABLE_REGIONS_REOPEN_REGIONS); if (reopenBatchBackoffMillis > 0) { - Thread.sleep(reopenBatchBackoffMillis); + backoff(reopenBatchBackoffMillis); } - setNextState(ReopenTableRegionsState.REOPEN_TABLE_REGIONS_REOPEN_REGIONS); return Flow.HAS_MORE_STATE; } } if (currentRegionBatch.stream().anyMatch(loc -> canSchedule(env, loc))) { retryCounter = null; + setNextState(ReopenTableRegionsState.REOPEN_TABLE_REGIONS_REOPEN_REGIONS); if (reopenBatchBackoffMillis > 0) { - Thread.sleep(reopenBatchBackoffMillis); + backoff(reopenBatchBackoffMillis); } - setNextState(ReopenTableRegionsState.REOPEN_TABLE_REGIONS_REOPEN_REGIONS); return Flow.HAS_MORE_STATE; } // We can not schedule TRSP for all the regions need to reopen, wait for a while and retry @@ -202,20 +202,25 @@ protected Flow executeFromState(MasterProcedureEnv env, ReopenTableRegionsState if (retryCounter == null) { retryCounter = ProcedureUtil.createRetryCounter(env.getMasterConfiguration()); } - long backoff = retryCounter.getBackoffTimeAndIncrementAttempts(); + long backoffMillis = retryCounter.getBackoffTimeAndIncrementAttempts(); LOG.info( "There are still {} region(s) which need to be reopened for table {}. {} are in " + "OPENING state, suspend {}secs and try again later", - regions.size(), tableName, currentRegionBatch.size(), backoff / 1000); - setTimeout(Math.toIntExact(backoff)); - setState(ProcedureProtos.ProcedureState.WAITING_TIMEOUT); - skipPersistence(); + regions.size(), tableName, currentRegionBatch.size(), backoffMillis / 1000); + backoff(backoffMillis); throw new ProcedureSuspendedException(); default: throw new UnsupportedOperationException("unhandled state=" + state); } } + private void backoff(long millis) throws ProcedureSuspendedException { + setTimeout(Math.toIntExact(millis)); + setState(ProcedureProtos.ProcedureState.WAITING_TIMEOUT); + skipPersistence(); + throw new ProcedureSuspendedException(); + } + private List getRegionLocationsForReopen(List tableRegionsForReopen) { From 32e08b2231cf7f516013603986fae2fb09424131 Mon Sep 17 00:00:00 2001 From: Ray Mattingly Date: Tue, 28 Nov 2023 10:55:57 -0500 Subject: [PATCH 3/9] don't use final --- .../hbase/master/procedure/ReopenTableRegionsProcedure.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ReopenTableRegionsProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ReopenTableRegionsProcedure.java index 0b9b3c4f8d3d..d4aa2e48a815 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ReopenTableRegionsProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ReopenTableRegionsProcedure.java @@ -73,8 +73,8 @@ public class ReopenTableRegionsProcedure private RetryCounter retryCounter; - private final long reopenBatchBackoffMillis; - private final int reopenBatchSize; + private long reopenBatchBackoffMillis; + private int reopenBatchSize; public ReopenTableRegionsProcedure() { this(null); From 36f7ee1cec807f9fb838d3860c673727fd3f63cb Mon Sep 17 00:00:00 2001 From: Ray Mattingly Date: Wed, 29 Nov 2023 10:08:06 -0500 Subject: [PATCH 4/9] support increasing batch size --- .../procedure/ModifyTableProcedure.java | 9 +-- .../ReopenTableRegionsProcedure.java | 53 ++++++++++++----- ...stReopenTableRegionsProcedureBatching.java | 59 +++++++++++++++---- 3 files changed, 91 insertions(+), 30 deletions(-) diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ModifyTableProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ModifyTableProcedure.java index 9a1a857b4c9d..ff1f59da38b7 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ModifyTableProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ModifyTableProcedure.java @@ -19,8 +19,8 @@ import static org.apache.hadoop.hbase.master.procedure.ReopenTableRegionsProcedure.REOPEN_BATCH_BACKOFF_MILLIS_DEFAULT; import static org.apache.hadoop.hbase.master.procedure.ReopenTableRegionsProcedure.REOPEN_BATCH_BACKOFF_MILLIS_KEY; -import static org.apache.hadoop.hbase.master.procedure.ReopenTableRegionsProcedure.REOPEN_BATCH_SIZE_DEFAULT; -import static org.apache.hadoop.hbase.master.procedure.ReopenTableRegionsProcedure.REOPEN_BATCH_SIZE_KEY; +import static org.apache.hadoop.hbase.master.procedure.ReopenTableRegionsProcedure.REOPEN_BATCH_SIZE_MAX_DEFAULT; +import static org.apache.hadoop.hbase.master.procedure.ReopenTableRegionsProcedure.REOPEN_BATCH_SIZE_MAX_KEY; import java.io.IOException; import java.util.Arrays; @@ -156,9 +156,10 @@ protected Flow executeFromState(final MasterProcedureEnv env, final ModifyTableS Configuration conf = env.getMasterConfiguration(); long backoffMillis = conf.getLong(REOPEN_BATCH_BACKOFF_MILLIS_KEY, REOPEN_BATCH_BACKOFF_MILLIS_DEFAULT); - int batchSize = conf.getInt(REOPEN_BATCH_SIZE_KEY, REOPEN_BATCH_SIZE_DEFAULT); + int batchSizeMax = + conf.getInt(REOPEN_BATCH_SIZE_MAX_KEY, REOPEN_BATCH_SIZE_MAX_DEFAULT); addChildProcedure( - new ReopenTableRegionsProcedure(getTableName(), backoffMillis, batchSize)); + new ReopenTableRegionsProcedure(getTableName(), backoffMillis, batchSizeMax)); } setNextState(ModifyTableState.MODIFY_TABLE_ASSIGN_NEW_REPLICAS); break; diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ReopenTableRegionsProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ReopenTableRegionsProcedure.java index d4aa2e48a815..267b521df65f 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ReopenTableRegionsProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ReopenTableRegionsProcedure.java @@ -58,8 +58,12 @@ public class ReopenTableRegionsProcedure public static final String REOPEN_BATCH_BACKOFF_MILLIS_KEY = "hbase.table.regions.reopen.batch.backoff.ms"; public static final long REOPEN_BATCH_BACKOFF_MILLIS_DEFAULT = 0L; - public static final String REOPEN_BATCH_SIZE_KEY = "hbase.table.regions.reopen.batch.size"; - public static final int REOPEN_BATCH_SIZE_DEFAULT = Integer.MAX_VALUE; + public static final String REOPEN_BATCH_SIZE_MAX_KEY = + "hbase.table.regions.reopen.batch.size.max"; + public static final int REOPEN_BATCH_SIZE_MAX_DEFAULT = Integer.MAX_VALUE; + + // this minimum prevents a max which would break this procedure + private static final int MINIMUM_BATCH_SIZE_MAX = 1; private TableName tableName; @@ -75,6 +79,9 @@ public class ReopenTableRegionsProcedure private long reopenBatchBackoffMillis; private int reopenBatchSize; + private int reopenBatchSizeMax; + private long regionsReopened = 0; + private long batchesProcessed = 0; public ReopenTableRegionsProcedure() { this(null); @@ -85,20 +92,22 @@ public ReopenTableRegionsProcedure(TableName tableName) { } public ReopenTableRegionsProcedure(final TableName tableName, final List regionNames) { - this(tableName, regionNames, REOPEN_BATCH_BACKOFF_MILLIS_DEFAULT, REOPEN_BATCH_SIZE_DEFAULT); + this(tableName, regionNames, REOPEN_BATCH_BACKOFF_MILLIS_DEFAULT, + REOPEN_BATCH_SIZE_MAX_DEFAULT); } public ReopenTableRegionsProcedure(final TableName tableName, long reopenBatchBackoffMillis, - int reopenBatchSize) { - this(tableName, Collections.emptyList(), reopenBatchBackoffMillis, reopenBatchSize); + int reopenBatchSizeMax) { + this(tableName, Collections.emptyList(), reopenBatchBackoffMillis, reopenBatchSizeMax); } public ReopenTableRegionsProcedure(final TableName tableName, final List regionNames, - long reopenBatchBackoffMillis, int reopenBatchSize) { + long reopenBatchBackoffMillis, int reopenBatchSizeMax) { this.tableName = tableName; this.regionNames = regionNames; this.reopenBatchBackoffMillis = reopenBatchBackoffMillis; - this.reopenBatchSize = reopenBatchSize; + this.reopenBatchSize = 1; + this.reopenBatchSizeMax = Math.max(reopenBatchSizeMax, MINIMUM_BATCH_SIZE_MAX); } @Override @@ -113,8 +122,14 @@ public TableOperationType getTableOperationType() { @RestrictedApi(explanation = "Should only be called in tests", link = "", allowedOnPath = ".*/src/test/.*") - public List getCurrentRegionBatch() { - return new ArrayList<>(currentRegionBatch); + public long getRegionsReopened() { + return regionsReopened; + } + + @RestrictedApi(explanation = "Should only be called in tests", link = "", + allowedOnPath = ".*/src/test/.*") + public long getBatchesProcessed() { + return batchesProcessed; } private boolean canSchedule(MasterProcedureEnv env, HRegionLocation loc) { @@ -144,6 +159,9 @@ protected Flow executeFromState(MasterProcedureEnv env, ReopenTableRegionsState setNextState(ReopenTableRegionsState.REOPEN_TABLE_REGIONS_REOPEN_REGIONS); return Flow.HAS_MORE_STATE; case REOPEN_TABLE_REGIONS_REOPEN_REGIONS: + if (!regions.isEmpty()) { + batchesProcessed++; + } currentRegionBatch = regions.stream().limit(reopenBatchSize).collect(Collectors.toList()); for (HRegionLocation loc : currentRegionBatch) { RegionStateNode regionNode = @@ -164,6 +182,7 @@ protected Flow executeFromState(MasterProcedureEnv env, ReopenTableRegionsState regionNode.unlock(); } addChildProcedure(proc); + regionsReopened++; } setNextState(ReopenTableRegionsState.REOPEN_TABLE_REGIONS_CONFIRM_REOPENED); return Flow.HAS_MORE_STATE; @@ -183,19 +202,22 @@ protected Flow executeFromState(MasterProcedureEnv env, ReopenTableRegionsState return Flow.NO_MORE_STATE; } else { setNextState(ReopenTableRegionsState.REOPEN_TABLE_REGIONS_REOPEN_REGIONS); + reopenBatchSize = Math.min(reopenBatchSizeMax, 2 * reopenBatchSize); if (reopenBatchBackoffMillis > 0) { - backoff(reopenBatchBackoffMillis); + setBackoffStateAndSuspend(reopenBatchBackoffMillis); + } else { + return Flow.HAS_MORE_STATE; } - return Flow.HAS_MORE_STATE; } } if (currentRegionBatch.stream().anyMatch(loc -> canSchedule(env, loc))) { retryCounter = null; setNextState(ReopenTableRegionsState.REOPEN_TABLE_REGIONS_REOPEN_REGIONS); if (reopenBatchBackoffMillis > 0) { - backoff(reopenBatchBackoffMillis); + setBackoffStateAndSuspend(reopenBatchBackoffMillis); + } else { + return Flow.HAS_MORE_STATE; } - return Flow.HAS_MORE_STATE; } // We can not schedule TRSP for all the regions need to reopen, wait for a while and retry // again. @@ -207,14 +229,13 @@ protected Flow executeFromState(MasterProcedureEnv env, ReopenTableRegionsState "There are still {} region(s) which need to be reopened for table {}. {} are in " + "OPENING state, suspend {}secs and try again later", regions.size(), tableName, currentRegionBatch.size(), backoffMillis / 1000); - backoff(backoffMillis); - throw new ProcedureSuspendedException(); + setBackoffStateAndSuspend(backoffMillis); default: throw new UnsupportedOperationException("unhandled state=" + state); } } - private void backoff(long millis) throws ProcedureSuspendedException { + private void setBackoffStateAndSuspend(long millis) throws ProcedureSuspendedException { setTimeout(Math.toIntExact(millis)); setState(ProcedureProtos.ProcedureState.WAITING_TIMEOUT); skipPersistence(); diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestReopenTableRegionsProcedureBatching.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestReopenTableRegionsProcedureBatching.java index 583b09e9ae48..aab981ddb524 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestReopenTableRegionsProcedureBatching.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestReopenTableRegionsProcedureBatching.java @@ -27,7 +27,6 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hbase.HBaseClassTestRule; import org.apache.hadoop.hbase.HBaseTestingUtil; -import org.apache.hadoop.hbase.HRegionLocation; import org.apache.hadoop.hbase.TableName; import org.apache.hadoop.hbase.client.RegionInfo; import org.apache.hadoop.hbase.master.RegionState.State; @@ -60,7 +59,7 @@ public class TestReopenTableRegionsProcedureBatching { private static final HBaseTestingUtil UTIL = new HBaseTestingUtil(); private static final int BACKOFF_MILLIS_PER_RS = 0; - private static final int REOPEN_BATCH_SIZE = 1; + private static final int REOPEN_BATCH_SIZE_MAX = 1; private static TableName TABLE_NAME = TableName.valueOf("Batching"); @@ -80,7 +79,7 @@ public static void tearDown() throws Exception { } @Test - public void testRegionBatching() throws IOException { + public void testSmallMaxBatchSize() throws IOException { AssignmentManager am = UTIL.getMiniHBaseCluster().getMaster().getAssignmentManager(); ProcedureExecutor procExec = UTIL.getMiniHBaseCluster().getMaster().getMasterProcedureExecutor(); @@ -89,15 +88,23 @@ public void testRegionBatching() throws IOException { Set stuckRegions = regions.stream().map(r -> stickRegion(am, procExec, r)).collect(Collectors.toSet()); ReopenTableRegionsProcedure proc = - new ReopenTableRegionsProcedure(TABLE_NAME, BACKOFF_MILLIS_PER_RS, REOPEN_BATCH_SIZE); + new ReopenTableRegionsProcedure(TABLE_NAME, BACKOFF_MILLIS_PER_RS, REOPEN_BATCH_SIZE_MAX); procExec.submitProcedure(proc); UTIL.waitFor(10000, () -> proc.getState() == ProcedureState.WAITING_TIMEOUT); - confirmBatchSize(REOPEN_BATCH_SIZE, stuckRegions, proc); + + // the first batch should be small + confirmBatchSize(1, stuckRegions, proc); ProcedureSyncWait.waitForProcedureToComplete(procExec, proc, 60_000); + + // other batches should also be small + assertTrue(proc.getBatchesProcessed() >= regions.size()); + + // all regions should only be opened once + assertEquals(proc.getRegionsReopened(), regions.size()); } @Test - public void testNoRegionBatching() throws IOException { + public void testDefaultMaxBatchSize() throws IOException { AssignmentManager am = UTIL.getMiniHBaseCluster().getMaster().getAssignmentManager(); ProcedureExecutor procExec = UTIL.getMiniHBaseCluster().getMaster().getMasterProcedureExecutor(); @@ -108,19 +115,51 @@ public void testNoRegionBatching() throws IOException { ReopenTableRegionsProcedure proc = new ReopenTableRegionsProcedure(TABLE_NAME); procExec.submitProcedure(proc); UTIL.waitFor(10000, () -> proc.getState() == ProcedureState.WAITING_TIMEOUT); - confirmBatchSize(regions.size(), stuckRegions, proc); + + // the first batch should be small + confirmBatchSize(1, stuckRegions, proc); ProcedureSyncWait.waitForProcedureToComplete(procExec, proc, 60_000); + + // other batches should get larger + assertTrue(proc.getBatchesProcessed() < regions.size()); + + // all regions should only be opened once + assertEquals(proc.getRegionsReopened(), regions.size()); + } + + @Test + public void testNegativeBatchSizeDoesNotBreak() throws IOException { + AssignmentManager am = UTIL.getMiniHBaseCluster().getMaster().getAssignmentManager(); + ProcedureExecutor procExec = + UTIL.getMiniHBaseCluster().getMaster().getMasterProcedureExecutor(); + List regions = UTIL.getAdmin().getRegions(TABLE_NAME); + assertTrue(2 <= regions.size()); + Set stuckRegions = + regions.stream().map(r -> stickRegion(am, procExec, r)).collect(Collectors.toSet()); + ReopenTableRegionsProcedure proc = + new ReopenTableRegionsProcedure(TABLE_NAME, BACKOFF_MILLIS_PER_RS, -1); + procExec.submitProcedure(proc); + UTIL.waitFor(10000, () -> proc.getState() == ProcedureState.WAITING_TIMEOUT); + + // the first batch should be small + confirmBatchSize(1, stuckRegions, proc); + ProcedureSyncWait.waitForProcedureToComplete(procExec, proc, 60_000); + + // other batches should also be small + assertTrue(proc.getBatchesProcessed() >= regions.size()); + + // all regions should only be opened once + assertEquals(proc.getRegionsReopened(), regions.size()); } private void confirmBatchSize(int expectedBatchSize, Set stuckRegions, ReopenTableRegionsProcedure proc) { while (true) { - List currentRegionBatch = proc.getCurrentRegionBatch(); - if (currentRegionBatch.isEmpty()) { + if (proc.getBatchesProcessed() == 0) { continue; } stuckRegions.forEach(this::unstickRegion); - assertEquals(expectedBatchSize, currentRegionBatch.size()); + UTIL.waitFor(5000, () -> expectedBatchSize == proc.getRegionsReopened()); break; } } From afa02f9f16635d3e6985cb27c5d1295fc2efc5ea Mon Sep 17 00:00:00 2001 From: Ray Mattingly Date: Wed, 29 Nov 2023 15:14:02 -0500 Subject: [PATCH 5/9] simplify --- .../ReopenTableRegionsProcedure.java | 99 ++++++++++--------- 1 file changed, 53 insertions(+), 46 deletions(-) diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ReopenTableRegionsProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ReopenTableRegionsProcedure.java index 267b521df65f..52d4aa3ad589 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ReopenTableRegionsProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ReopenTableRegionsProcedure.java @@ -22,7 +22,6 @@ import java.util.ArrayList; import java.util.Collections; import java.util.List; -import java.util.Set; import java.util.stream.Collectors; import org.apache.hadoop.hbase.HRegionLocation; import org.apache.hadoop.hbase.TableName; @@ -159,10 +158,12 @@ protected Flow executeFromState(MasterProcedureEnv env, ReopenTableRegionsState setNextState(ReopenTableRegionsState.REOPEN_TABLE_REGIONS_REOPEN_REGIONS); return Flow.HAS_MORE_STATE; case REOPEN_TABLE_REGIONS_REOPEN_REGIONS: - if (!regions.isEmpty()) { + // if we didn't finish reopening the last batch yet, let's keep trying until we do. + // at that point, the batch will be empty and we can generate a new batch + if (!regions.isEmpty() && currentRegionBatch.isEmpty()) { + currentRegionBatch = regions.stream().limit(reopenBatchSize).collect(Collectors.toList()); batchesProcessed++; } - currentRegionBatch = regions.stream().limit(reopenBatchSize).collect(Collectors.toList()); for (HRegionLocation loc : currentRegionBatch) { RegionStateNode regionNode = env.getAssignmentManager().getRegionStates().getRegionStateNode(loc.getRegion()); @@ -187,59 +188,65 @@ protected Flow executeFromState(MasterProcedureEnv env, ReopenTableRegionsState setNextState(ReopenTableRegionsState.REOPEN_TABLE_REGIONS_CONFIRM_REOPENED); return Flow.HAS_MORE_STATE; case REOPEN_TABLE_REGIONS_CONFIRM_REOPENED: - regions = regions.stream().map(env.getAssignmentManager().getRegionStates()::checkReopened) - .filter(l -> l != null).collect(Collectors.toList()); - // we need to create a set of region names because the HRegionLocation hashcode is only - // based - // on the server name - Set currentRegionBatchNames = currentRegionBatch.stream() - .map(r -> r.getRegion().getRegionName()).collect(Collectors.toSet()); - currentRegionBatch = regions.stream() - .filter(r -> currentRegionBatchNames.contains(r.getRegion().getRegionName())) - .collect(Collectors.toList()); - if (currentRegionBatch.isEmpty()) { - if (regions.isEmpty()) { - return Flow.NO_MORE_STATE; - } else { - setNextState(ReopenTableRegionsState.REOPEN_TABLE_REGIONS_REOPEN_REGIONS); - reopenBatchSize = Math.min(reopenBatchSizeMax, 2 * reopenBatchSize); - if (reopenBatchBackoffMillis > 0) { - setBackoffStateAndSuspend(reopenBatchBackoffMillis); - } else { - return Flow.HAS_MORE_STATE; - } - } + // update region lists based on what's been reopened + regions = filterReopened(env, regions); + currentRegionBatch = filterReopened(env, currentRegionBatch); + + // existing batch didn't fully reopen, so try to resolve that first. + // since this is a retry, don't do the batch backoff + if (!currentRegionBatch.isEmpty()) { + return reopenIfSchedulable(env, currentRegionBatch, false); } - if (currentRegionBatch.stream().anyMatch(loc -> canSchedule(env, loc))) { - retryCounter = null; - setNextState(ReopenTableRegionsState.REOPEN_TABLE_REGIONS_REOPEN_REGIONS); - if (reopenBatchBackoffMillis > 0) { - setBackoffStateAndSuspend(reopenBatchBackoffMillis); - } else { - return Flow.HAS_MORE_STATE; - } - } - // We can not schedule TRSP for all the regions need to reopen, wait for a while and retry - // again. - if (retryCounter == null) { - retryCounter = ProcedureUtil.createRetryCounter(env.getMasterConfiguration()); + + if (regions.isEmpty()) { + return Flow.NO_MORE_STATE; } - long backoffMillis = retryCounter.getBackoffTimeAndIncrementAttempts(); - LOG.info( - "There are still {} region(s) which need to be reopened for table {}. {} are in " - + "OPENING state, suspend {}secs and try again later", - regions.size(), tableName, currentRegionBatch.size(), backoffMillis / 1000); - setBackoffStateAndSuspend(backoffMillis); + + // current batch is finished, schedule more regions + return reopenIfSchedulable(env, regions, true); default: throw new UnsupportedOperationException("unhandled state=" + state); } } - private void setBackoffStateAndSuspend(long millis) throws ProcedureSuspendedException { + private List filterReopened(MasterProcedureEnv env, + List regionsToCheck) { + return regionsToCheck.stream().map(env.getAssignmentManager().getRegionStates()::checkReopened) + .filter(l -> l != null).collect(Collectors.toList()); + } + + private Flow reopenIfSchedulable(MasterProcedureEnv env, List regionsToReopen, + boolean shouldBatchBackoff) throws ProcedureSuspendedException { + if (regionsToReopen.stream().anyMatch(loc -> canSchedule(env, loc))) { + retryCounter = null; + setNextState(ReopenTableRegionsState.REOPEN_TABLE_REGIONS_REOPEN_REGIONS); + reopenBatchSize = Math.min(reopenBatchSizeMax, 2 * reopenBatchSize); + if (shouldBatchBackoff && reopenBatchBackoffMillis > 0) { + setBackoffState(reopenBatchBackoffMillis); + throw new ProcedureSuspendedException(); + } else { + return Flow.HAS_MORE_STATE; + } + } + + // We can not schedule TRSP for all the regions need to reopen, wait for a while and retry + // again. + if (retryCounter == null) { + retryCounter = ProcedureUtil.createRetryCounter(env.getMasterConfiguration()); + } + long backoffMillis = retryCounter.getBackoffTimeAndIncrementAttempts(); + LOG.info( + "There are still {} region(s) which need to be reopened for table {}. {} are in " + + "OPENING state, suspend {}secs and try again later", + regions.size(), tableName, currentRegionBatch.size(), backoffMillis / 1000); + setBackoffState(backoffMillis); + throw new ProcedureSuspendedException(); + } + + private void setBackoffState(long millis) { setTimeout(Math.toIntExact(millis)); setState(ProcedureProtos.ProcedureState.WAITING_TIMEOUT); skipPersistence(); - throw new ProcedureSuspendedException(); } private List From b8e21c9ef2a04892242e59bd729b8e38265ee835 Mon Sep 17 00:00:00 2001 From: Ray Mattingly Date: Wed, 29 Nov 2023 22:29:17 -0500 Subject: [PATCH 6/9] pr feedback: default to no progression, change names --- .../procedure/ModifyTableProcedure.java | 14 ++++++------ .../ReopenTableRegionsProcedure.java | 22 ++++++++++--------- ...stReopenTableRegionsProcedureBatching.java | 7 ++---- 3 files changed, 21 insertions(+), 22 deletions(-) diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ModifyTableProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ModifyTableProcedure.java index ff1f59da38b7..8ba62b35fb2f 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ModifyTableProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ModifyTableProcedure.java @@ -17,10 +17,10 @@ */ package org.apache.hadoop.hbase.master.procedure; -import static org.apache.hadoop.hbase.master.procedure.ReopenTableRegionsProcedure.REOPEN_BATCH_BACKOFF_MILLIS_DEFAULT; -import static org.apache.hadoop.hbase.master.procedure.ReopenTableRegionsProcedure.REOPEN_BATCH_BACKOFF_MILLIS_KEY; -import static org.apache.hadoop.hbase.master.procedure.ReopenTableRegionsProcedure.REOPEN_BATCH_SIZE_MAX_DEFAULT; -import static org.apache.hadoop.hbase.master.procedure.ReopenTableRegionsProcedure.REOPEN_BATCH_SIZE_MAX_KEY; +import static org.apache.hadoop.hbase.master.procedure.ReopenTableRegionsProcedure.PROGRESSIVE_BATCH_BACKOFF_MILLIS_DEFAULT; +import static org.apache.hadoop.hbase.master.procedure.ReopenTableRegionsProcedure.PROGRESSIVE_BATCH_BACKOFF_MILLIS_KEY; +import static org.apache.hadoop.hbase.master.procedure.ReopenTableRegionsProcedure.PROGRESSIVE_BATCH_SIZE_MAX_DEFAULT; +import static org.apache.hadoop.hbase.master.procedure.ReopenTableRegionsProcedure.PROGRESSIVE_BATCH_SIZE_MAX_KEY; import java.io.IOException; import java.util.Arrays; @@ -154,10 +154,10 @@ protected Flow executeFromState(final MasterProcedureEnv env, final ModifyTableS case MODIFY_TABLE_REOPEN_ALL_REGIONS: if (isTableEnabled(env)) { Configuration conf = env.getMasterConfiguration(); - long backoffMillis = - conf.getLong(REOPEN_BATCH_BACKOFF_MILLIS_KEY, REOPEN_BATCH_BACKOFF_MILLIS_DEFAULT); + long backoffMillis = conf.getLong(PROGRESSIVE_BATCH_BACKOFF_MILLIS_KEY, + PROGRESSIVE_BATCH_BACKOFF_MILLIS_DEFAULT); int batchSizeMax = - conf.getInt(REOPEN_BATCH_SIZE_MAX_KEY, REOPEN_BATCH_SIZE_MAX_DEFAULT); + conf.getInt(PROGRESSIVE_BATCH_SIZE_MAX_KEY, PROGRESSIVE_BATCH_SIZE_MAX_DEFAULT); addChildProcedure( new ReopenTableRegionsProcedure(getTableName(), backoffMillis, batchSizeMax)); } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ReopenTableRegionsProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ReopenTableRegionsProcedure.java index 52d4aa3ad589..2d9f2f747268 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ReopenTableRegionsProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ReopenTableRegionsProcedure.java @@ -54,12 +54,12 @@ public class ReopenTableRegionsProcedure private static final Logger LOG = LoggerFactory.getLogger(ReopenTableRegionsProcedure.class); - public static final String REOPEN_BATCH_BACKOFF_MILLIS_KEY = - "hbase.table.regions.reopen.batch.backoff.ms"; - public static final long REOPEN_BATCH_BACKOFF_MILLIS_DEFAULT = 0L; - public static final String REOPEN_BATCH_SIZE_MAX_KEY = - "hbase.table.regions.reopen.batch.size.max"; - public static final int REOPEN_BATCH_SIZE_MAX_DEFAULT = Integer.MAX_VALUE; + public static final String PROGRESSIVE_BATCH_BACKOFF_MILLIS_KEY = + "hbase.reopen.table.regions.progressive.batch.backoff.ms"; + public static final long PROGRESSIVE_BATCH_BACKOFF_MILLIS_DEFAULT = 0L; + public static final String PROGRESSIVE_BATCH_SIZE_MAX_KEY = + "hbase.reopen.table.regions.progressive.batch.size.max"; + public static final int PROGRESSIVE_BATCH_SIZE_MAX_DEFAULT = Integer.MAX_VALUE; // this minimum prevents a max which would break this procedure private static final int MINIMUM_BATCH_SIZE_MAX = 1; @@ -91,8 +91,8 @@ public ReopenTableRegionsProcedure(TableName tableName) { } public ReopenTableRegionsProcedure(final TableName tableName, final List regionNames) { - this(tableName, regionNames, REOPEN_BATCH_BACKOFF_MILLIS_DEFAULT, - REOPEN_BATCH_SIZE_MAX_DEFAULT); + this(tableName, regionNames, PROGRESSIVE_BATCH_BACKOFF_MILLIS_DEFAULT, + PROGRESSIVE_BATCH_SIZE_MAX_DEFAULT); } public ReopenTableRegionsProcedure(final TableName tableName, long reopenBatchBackoffMillis, @@ -105,7 +105,9 @@ public ReopenTableRegionsProcedure(final TableName tableName, final List this.tableName = tableName; this.regionNames = regionNames; this.reopenBatchBackoffMillis = reopenBatchBackoffMillis; - this.reopenBatchSize = 1; + this.reopenBatchSize = reopenBatchSizeMax != PROGRESSIVE_BATCH_SIZE_MAX_DEFAULT + ? 1 + : PROGRESSIVE_BATCH_SIZE_MAX_DEFAULT; this.reopenBatchSizeMax = Math.max(reopenBatchSizeMax, MINIMUM_BATCH_SIZE_MAX); } @@ -220,8 +222,8 @@ private Flow reopenIfSchedulable(MasterProcedureEnv env, List r if (regionsToReopen.stream().anyMatch(loc -> canSchedule(env, loc))) { retryCounter = null; setNextState(ReopenTableRegionsState.REOPEN_TABLE_REGIONS_REOPEN_REGIONS); - reopenBatchSize = Math.min(reopenBatchSizeMax, 2 * reopenBatchSize); if (shouldBatchBackoff && reopenBatchBackoffMillis > 0) { + reopenBatchSize = Math.min(reopenBatchSizeMax, 2 * reopenBatchSize); setBackoffState(reopenBatchBackoffMillis); throw new ProcedureSuspendedException(); } else { diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestReopenTableRegionsProcedureBatching.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestReopenTableRegionsProcedureBatching.java index aab981ddb524..d98d2f4fe926 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestReopenTableRegionsProcedureBatching.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestReopenTableRegionsProcedureBatching.java @@ -116,13 +116,10 @@ public void testDefaultMaxBatchSize() throws IOException { procExec.submitProcedure(proc); UTIL.waitFor(10000, () -> proc.getState() == ProcedureState.WAITING_TIMEOUT); - // the first batch should be small - confirmBatchSize(1, stuckRegions, proc); + // the first batch should be large + confirmBatchSize(regions.size(), stuckRegions, proc); ProcedureSyncWait.waitForProcedureToComplete(procExec, proc, 60_000); - // other batches should get larger - assertTrue(proc.getBatchesProcessed() < regions.size()); - // all regions should only be opened once assertEquals(proc.getRegionsReopened(), regions.size()); } From 8a9f8f40177361792a2693611c9bf370857fcdb0 Mon Sep 17 00:00:00 2001 From: Ray Mattingly Date: Fri, 1 Dec 2023 15:36:56 -0500 Subject: [PATCH 7/9] use -1 as batching disabled value --- .../hbase/master/procedure/ModifyTableProcedure.java | 4 ++-- .../master/procedure/ReopenTableRegionsProcedure.java | 9 +++++---- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ModifyTableProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ModifyTableProcedure.java index 8ba62b35fb2f..91c237b74f4f 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ModifyTableProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ModifyTableProcedure.java @@ -19,7 +19,7 @@ import static org.apache.hadoop.hbase.master.procedure.ReopenTableRegionsProcedure.PROGRESSIVE_BATCH_BACKOFF_MILLIS_DEFAULT; import static org.apache.hadoop.hbase.master.procedure.ReopenTableRegionsProcedure.PROGRESSIVE_BATCH_BACKOFF_MILLIS_KEY; -import static org.apache.hadoop.hbase.master.procedure.ReopenTableRegionsProcedure.PROGRESSIVE_BATCH_SIZE_MAX_DEFAULT; +import static org.apache.hadoop.hbase.master.procedure.ReopenTableRegionsProcedure.PROGRESSIVE_BATCH_SIZE_MAX_DISABLED; import static org.apache.hadoop.hbase.master.procedure.ReopenTableRegionsProcedure.PROGRESSIVE_BATCH_SIZE_MAX_KEY; import java.io.IOException; @@ -157,7 +157,7 @@ protected Flow executeFromState(final MasterProcedureEnv env, final ModifyTableS long backoffMillis = conf.getLong(PROGRESSIVE_BATCH_BACKOFF_MILLIS_KEY, PROGRESSIVE_BATCH_BACKOFF_MILLIS_DEFAULT); int batchSizeMax = - conf.getInt(PROGRESSIVE_BATCH_SIZE_MAX_KEY, PROGRESSIVE_BATCH_SIZE_MAX_DEFAULT); + conf.getInt(PROGRESSIVE_BATCH_SIZE_MAX_KEY, PROGRESSIVE_BATCH_SIZE_MAX_DISABLED); addChildProcedure( new ReopenTableRegionsProcedure(getTableName(), backoffMillis, batchSizeMax)); } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ReopenTableRegionsProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ReopenTableRegionsProcedure.java index 2d9f2f747268..7a9d0f655190 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ReopenTableRegionsProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ReopenTableRegionsProcedure.java @@ -59,7 +59,8 @@ public class ReopenTableRegionsProcedure public static final long PROGRESSIVE_BATCH_BACKOFF_MILLIS_DEFAULT = 0L; public static final String PROGRESSIVE_BATCH_SIZE_MAX_KEY = "hbase.reopen.table.regions.progressive.batch.size.max"; - public static final int PROGRESSIVE_BATCH_SIZE_MAX_DEFAULT = Integer.MAX_VALUE; + public static final int PROGRESSIVE_BATCH_SIZE_MAX_DISABLED = -1; + private static final int PROGRESSIVE_BATCH_SIZE_MAX_DEFAULT_VALUE = Integer.MAX_VALUE; // this minimum prevents a max which would break this procedure private static final int MINIMUM_BATCH_SIZE_MAX = 1; @@ -92,7 +93,7 @@ public ReopenTableRegionsProcedure(TableName tableName) { public ReopenTableRegionsProcedure(final TableName tableName, final List regionNames) { this(tableName, regionNames, PROGRESSIVE_BATCH_BACKOFF_MILLIS_DEFAULT, - PROGRESSIVE_BATCH_SIZE_MAX_DEFAULT); + PROGRESSIVE_BATCH_SIZE_MAX_DISABLED); } public ReopenTableRegionsProcedure(final TableName tableName, long reopenBatchBackoffMillis, @@ -105,9 +106,9 @@ public ReopenTableRegionsProcedure(final TableName tableName, final List this.tableName = tableName; this.regionNames = regionNames; this.reopenBatchBackoffMillis = reopenBatchBackoffMillis; - this.reopenBatchSize = reopenBatchSizeMax != PROGRESSIVE_BATCH_SIZE_MAX_DEFAULT + this.reopenBatchSize = reopenBatchSizeMax != PROGRESSIVE_BATCH_SIZE_MAX_DISABLED ? 1 - : PROGRESSIVE_BATCH_SIZE_MAX_DEFAULT; + : PROGRESSIVE_BATCH_SIZE_MAX_DEFAULT_VALUE; this.reopenBatchSizeMax = Math.max(reopenBatchSizeMax, MINIMUM_BATCH_SIZE_MAX); } From 08991b2aa3740732a88cdc48facf41210c9251ed Mon Sep 17 00:00:00 2001 From: Ray Mattingly Date: Fri, 1 Dec 2023 17:10:13 -0500 Subject: [PATCH 8/9] simplify, protect against overflow --- .../ReopenTableRegionsProcedure.java | 25 +++++++++++++++---- ...stReopenTableRegionsProcedureBatching.java | 13 +++++++++- 2 files changed, 32 insertions(+), 6 deletions(-) diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ReopenTableRegionsProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ReopenTableRegionsProcedure.java index 7a9d0f655190..d50fa49506dd 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ReopenTableRegionsProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ReopenTableRegionsProcedure.java @@ -106,10 +106,13 @@ public ReopenTableRegionsProcedure(final TableName tableName, final List this.tableName = tableName; this.regionNames = regionNames; this.reopenBatchBackoffMillis = reopenBatchBackoffMillis; - this.reopenBatchSize = reopenBatchSizeMax != PROGRESSIVE_BATCH_SIZE_MAX_DISABLED - ? 1 - : PROGRESSIVE_BATCH_SIZE_MAX_DEFAULT_VALUE; - this.reopenBatchSizeMax = Math.max(reopenBatchSizeMax, MINIMUM_BATCH_SIZE_MAX); + if (reopenBatchSizeMax == PROGRESSIVE_BATCH_SIZE_MAX_DISABLED) { + this.reopenBatchSize = Integer.MAX_VALUE; + this.reopenBatchSizeMax = Integer.MAX_VALUE; + } else { + this.reopenBatchSize = 1; + this.reopenBatchSizeMax = Math.max(reopenBatchSizeMax, MINIMUM_BATCH_SIZE_MAX); + } } @Override @@ -134,6 +137,18 @@ public long getBatchesProcessed() { return batchesProcessed; } + @RestrictedApi(explanation = "Should only be called in tests", link = "", + allowedOnPath = ".*/src/test/.*") + protected int progressBatchSize() { + int previousBatchSize = reopenBatchSize; + reopenBatchSize = Math.min(reopenBatchSizeMax, 2 * reopenBatchSize); + if (reopenBatchSize < previousBatchSize) { + // the batch size should never decrease. this must be overflow, so just use max + reopenBatchSize = reopenBatchSizeMax; + } + return reopenBatchSize; + } + private boolean canSchedule(MasterProcedureEnv env, HRegionLocation loc) { if (loc.getSeqNum() < 0) { return false; @@ -224,7 +239,7 @@ private Flow reopenIfSchedulable(MasterProcedureEnv env, List r retryCounter = null; setNextState(ReopenTableRegionsState.REOPEN_TABLE_REGIONS_REOPEN_REGIONS); if (shouldBatchBackoff && reopenBatchBackoffMillis > 0) { - reopenBatchSize = Math.min(reopenBatchSizeMax, 2 * reopenBatchSize); + progressBatchSize(); setBackoffState(reopenBatchBackoffMillis); throw new ProcedureSuspendedException(); } else { diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestReopenTableRegionsProcedureBatching.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestReopenTableRegionsProcedureBatching.java index d98d2f4fe926..8ea9b3c6a309 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestReopenTableRegionsProcedureBatching.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestReopenTableRegionsProcedureBatching.java @@ -134,7 +134,7 @@ public void testNegativeBatchSizeDoesNotBreak() throws IOException { Set stuckRegions = regions.stream().map(r -> stickRegion(am, procExec, r)).collect(Collectors.toSet()); ReopenTableRegionsProcedure proc = - new ReopenTableRegionsProcedure(TABLE_NAME, BACKOFF_MILLIS_PER_RS, -1); + new ReopenTableRegionsProcedure(TABLE_NAME, BACKOFF_MILLIS_PER_RS, -100); procExec.submitProcedure(proc); UTIL.waitFor(10000, () -> proc.getState() == ProcedureState.WAITING_TIMEOUT); @@ -149,6 +149,17 @@ public void testNegativeBatchSizeDoesNotBreak() throws IOException { assertEquals(proc.getRegionsReopened(), regions.size()); } + @Test + public void testBatchSizeDoesNotOverflow() { + ReopenTableRegionsProcedure proc = + new ReopenTableRegionsProcedure(TABLE_NAME, BACKOFF_MILLIS_PER_RS, Integer.MAX_VALUE); + int currentBatchSize = 1; + while (currentBatchSize < Integer.MAX_VALUE) { + currentBatchSize = proc.progressBatchSize(); + assertTrue(currentBatchSize > 0); + } + } + private void confirmBatchSize(int expectedBatchSize, Set stuckRegions, ReopenTableRegionsProcedure proc) { while (true) { From 0d010a0e29865fcc20ab82ca77b3c46110b0ab09 Mon Sep 17 00:00:00 2001 From: Ray Mattingly Date: Sun, 3 Dec 2023 10:06:10 -0500 Subject: [PATCH 9/9] update RestrictedApi annotation --- .../hbase/master/procedure/ReopenTableRegionsProcedure.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ReopenTableRegionsProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ReopenTableRegionsProcedure.java index d50fa49506dd..353636e6ddd6 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ReopenTableRegionsProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ReopenTableRegionsProcedure.java @@ -137,8 +137,8 @@ public long getBatchesProcessed() { return batchesProcessed; } - @RestrictedApi(explanation = "Should only be called in tests", link = "", - allowedOnPath = ".*/src/test/.*") + @RestrictedApi(explanation = "Should only be called internally or in tests", link = "", + allowedOnPath = ".*(/src/test/.*|ReopenTableRegionsProcedure).java") protected int progressBatchSize() { int previousBatchSize = reopenBatchSize; reopenBatchSize = Math.min(reopenBatchSizeMax, 2 * reopenBatchSize);