Skip to content

Commit

Permalink
HBASE-27231 FSHLog should retry writing WAL entries when syncs to HDF…
Browse files Browse the repository at this point in the history
…S failed (#5317)

Co-authored-by: Duo Zhang <[email protected]>
Co-authored-by: chenglei <[email protected]>

Signed-off-by: chenglei <[email protected]>
Signed-off-by: Duo Zhang <[email protected]>
  • Loading branch information
Apache9 authored Jul 13, 2023
1 parent 1d704a7 commit 3353381
Show file tree
Hide file tree
Showing 11 changed files with 1,112 additions and 2,124 deletions.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Durability;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.regionserver.wal.AbstractFSWAL;
import org.apache.hadoop.hbase.regionserver.wal.FSHLog;
import org.apache.hadoop.hbase.regionserver.wal.FailedLogCloseException;
import org.apache.hadoop.hbase.testclassification.SmallTests;
Expand Down Expand Up @@ -93,6 +94,7 @@ public void setup() throws IOException {
CONF = TEST_UTIL.getConfiguration();
// Disable block cache.
CONF.setFloat(HConstants.HFILE_BLOCK_CACHE_SIZE_KEY, 0f);
CONF.setLong(AbstractFSWAL.WAL_SYNC_TIMEOUT_MS, 10000);
dir = TEST_UTIL.getDataTestDir("TestHRegion").toString();
tableName = TableName.valueOf(name.getMethodName());
}
Expand Down Expand Up @@ -258,22 +260,16 @@ public void testLockupAroundBadAssignSync() throws IOException {
dodgyWAL.throwSyncException = true;
Put put = new Put(value);
put.addColumn(COLUMN_FAMILY_BYTES, Bytes.toBytes("2"), value);
region.rsServices = services;
region.put(put);
} catch (IOException ioe) {
threwOnSync = true;
}
// An append to the WAL followed by a failed sync is a server abort condition. That is our
// current semantic. Verify. It takes a while for abort to be called. Just hang here till it
// happens. If it doesn't, we'll time out the whole test. That is fine.
while (true) {
try {
verify(services, atLeast(1)).abort(anyString(), any(Throwable.class));
break;
} catch (WantedButNotInvoked t) {
Threads.sleep(1);
}
}

region.rsServices = null;
// An append in the WAL but the sync failed is a server abort condition. That is our
// current semantic. Verify.
verify(services, atLeast(1)).abort(anyString(), any());
try {
dodgyWAL.throwAppendException = false;
dodgyWAL.throwSyncException = false;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,8 @@
import org.apache.hadoop.hbase.regionserver.Region.RowLock;
import org.apache.hadoop.hbase.regionserver.TestHStore.FaultyFileSystem;
import org.apache.hadoop.hbase.regionserver.compactions.CompactionRequestImpl;
import org.apache.hadoop.hbase.regionserver.wal.AbstractFSWAL;
import org.apache.hadoop.hbase.regionserver.wal.AsyncFSWAL;
import org.apache.hadoop.hbase.regionserver.wal.FSHLog;
import org.apache.hadoop.hbase.regionserver.wal.MetricsWALSource;
import org.apache.hadoop.hbase.regionserver.wal.WALUtil;
Expand Down Expand Up @@ -178,6 +180,7 @@
import org.junit.Assert;
import org.junit.Before;
import org.junit.ClassRule;
import org.junit.Ignore;
import org.junit.Rule;
import org.junit.Test;
import org.junit.experimental.categories.Category;
Expand Down Expand Up @@ -260,6 +263,7 @@ public void setup() throws IOException {
method = name.getMethodName();
tableName = TableName.valueOf(method);
CONF.set(CompactingMemStore.IN_MEMORY_FLUSH_THRESHOLD_FACTOR_KEY, String.valueOf(0.09));
CONF.setLong(AbstractFSWAL.WAL_SYNC_TIMEOUT_MS, 10000);
}

@After
Expand Down Expand Up @@ -5415,7 +5419,14 @@ public void testPutWithMemStoreFlush() throws Exception {
assertArrayEquals(Bytes.toBytes("value1"), CellUtil.cloneValue(kvs.get(0)));
}

/**
* For this test, the spied {@link AsyncFSWAL} cannot work properly because of a Mockito defect
* that cannot deal with classes which have a field of an inner class. See discussions in
* HBASE-15536. When we reuse the code of {@link AsyncFSWAL} for {@link FSHLog}, this test cannot
* work for {@link FSHLog} either.
*/
@Test
@Ignore
public void testDurability() throws Exception {
// there are 5 x 5 cases:
// table durability(SYNC,FSYNC,ASYNC,SKIP,USE_DEFAULT) x mutation
Expand Down Expand Up @@ -5469,6 +5480,7 @@ private void durabilityTest(String method, Durability tableDurability,
Durability mutationDurability, long timeout, boolean expectAppend, final boolean expectSync,
final boolean expectSyncFromLogSyncer) throws Exception {
Configuration conf = HBaseConfiguration.create(CONF);
conf.setLong(AbstractFSWAL.WAL_SHUTDOWN_WAIT_TIMEOUT_MS, 60 * 60 * 1000);
method = method + "_" + tableDurability.name() + "_" + mutationDurability.name();
byte[] family = Bytes.toBytes("family");
Path logDir = new Path(new Path(dir + method), "log");
Expand Down
Loading

0 comments on commit 3353381

Please sign in to comment.