Skip to content

Commit

Permalink
[#23243] docdb: Fix tablet bootstrap stuck when replaying truncate op…
Browse files Browse the repository at this point in the history
…eration

Summary:
**Issue:**
Tablet bootstrap can run into a deadlock if it needs to replay a bootstrap operation. Here are the sequence of events leading to the deadlock

| Thread 1 (Main bootstrap thread) | Thread 2 (Load Transactions) |
| 1. Begin tablet bootstrap | |
| 2. During OpenTablet, if transaction is enabled(ysql table or ycql table with transaction enabled), acquire the the `start_latch` by setting its value to 1, and another thread(Thread 2) is created for transaction load| |
|  | 3. Execute transaction load, acquires `pending_op_counter_blocking_rocksdb_shutdown_start_` to prevent rocksdb shutdown |
| 4. Replay tablet truncate operation, waiting for `pending_op_counter_blocking_rocksdb_shutdown_start_` to be released in order to shutdown rocksdb | |
| | 5. Transaction load completed |
| | 6. Call `LoadFinished` function, it starts waiting for `start_latch` to be 0|
| 7. Bootstrap complete, release the the `start_latch` by setting its value to 0 | |
|  | 8. release the `pending_op_counter_blocking_rocksdb_shutdown_start_` |
Thread 1 stucks at step 4, waiting for step 8 to be executed.
Thread 2 stucks at step 6, waiting for step 7 to be executed.

Result: Tablet bootstrap stuck at replaying truncation operation. This issue starts happening since D29000 (commit id: 5159eb3), the diff Introduced a change to only destroy executor instance(which holds the operation counter) after FinishLoad.

**Fix:**
Reset `pending_op_counter_blocking_rocksdb_shutdown_start_` before calling `loader_.FinishLoad(status)`. This is not clean fix, but it guarantees the safety, because FinishLoad doesn't need protection from the op counter as it acquires own op counter when processing the pending applies.

**Affected Version 2.20.1**
Starting from D29000 (commit id: 5159eb3), tablet bootstrap will get stuck when replaying truncate operation.
Jira: DB-12175

Test Plan: ./yb_build.sh --cxx-test pgwrapper_pg_single_tserver-test --gtest_filter PgSingleTServerTest.BootstrapReplayTruncate

Reviewers: bkolagani, timur, sergei, mbautin, rthallam

Reviewed By: sergei, mbautin, rthallam

Subscribers: mbautin, yql, slingam, rthallam, ybase

Differential Revision: https://phorge.dev.yugabyte.com/D37152
  • Loading branch information
yusong-yan committed Aug 17, 2024
1 parent 15fd362 commit c8cbcbf
Show file tree
Hide file tree
Showing 6 changed files with 56 additions and 3 deletions.
9 changes: 9 additions & 0 deletions src/yb/tablet/operations/truncate_operation.cc
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,10 @@

#include "yb/util/trace.h"

DEFINE_test_flag(bool, skip_applying_truncate, false,
"If true, the test will skip applying tablet truncate operation."
"Note that other operations will still be applied.");

namespace yb {
namespace tablet {

Expand All @@ -43,6 +47,11 @@ Status TruncateOperation::DoAborted(const Status& status) {
Status TruncateOperation::DoReplicated(int64_t leader_term, Status* complete_status) {
TRACE("APPLY TRUNCATE: started");

if (FLAGS_TEST_skip_applying_truncate) {
TRACE("APPLY TRUNCATE: skipped");
return Status::OK();
}

RETURN_NOT_OK(VERIFY_RESULT(tablet_safe())->Truncate(this));

TRACE("APPLY TRUNCATE: finished");
Expand Down
3 changes: 3 additions & 0 deletions src/yb/tablet/tablet_bootstrap.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1842,6 +1842,9 @@ class TabletBootstrap {

TruncateOperation operation(tablet_, req);

operation.set_op_id(OpId::FromPB(replicate_msg->id()));
operation.set_hybrid_time(HybridTime::FromPB(replicate_msg->hybrid_time()));

Status s = tablet_->Truncate(&operation);

RETURN_NOT_OK_PREPEND(s, "Failed to Truncate:");
Expand Down
2 changes: 2 additions & 0 deletions src/yb/tablet/transaction_loader.cc
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,8 @@ class TransactionLoader::Executor {
Status status;

auto se = ScopeExit([this, &status] {
scoped_pending_operation_.Reset();

loader_.FinishLoad(status);
// Destroy this executor object. Must be the last statement before we return from the Execute
// function.
Expand Down
10 changes: 8 additions & 2 deletions src/yb/tserver/ts_tablet_manager.cc
Original file line number Diff line number Diff line change
Expand Up @@ -865,10 +865,16 @@ void TSTabletManager::CleanupSplitTablets() {
}
}

Status TSTabletManager::WaitForAllBootstrapsToFinish() {
Status TSTabletManager::WaitForAllBootstrapsToFinish(MonoDelta timeout) {
CHECK_EQ(state(), MANAGER_RUNNING);

open_tablet_pool_->Wait();
if (timeout.Initialized()) {
if (!open_tablet_pool_->WaitFor(timeout)) {
return STATUS(TimedOut, "Timeout waiting for all bootstraps to finish");
}
} else {
open_tablet_pool_->Wait();
}

Status s = Status::OK();

Expand Down
2 changes: 1 addition & 1 deletion src/yb/tserver/ts_tablet_manager.h
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,7 @@ class TSTabletManager : public tserver::TabletPeerLookupIf, public tablet::Table
// Returns Status::OK if all tablets bootstrapped successfully. If
// the bootstrap of any tablet failed returns the failure reason for
// the first tablet whose bootstrap failed.
Status WaitForAllBootstrapsToFinish();
Status WaitForAllBootstrapsToFinish(MonoDelta timeout = MonoDelta());

// Starts shutdown process.
void StartShutdown();
Expand Down
33 changes: 33 additions & 0 deletions src/yb/yql/pgwrapper/pg_single_tserver-test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@

#include "yb/common/colocated_util.h"

#include "yb/consensus/log.h"

#include "yb/tablet/tablet.h"
#include "yb/tablet/tablet_peer.h"
#include "yb/tablet/transaction_participant.h"
Expand Down Expand Up @@ -54,13 +56,16 @@ DECLARE_int64(db_block_cache_size_bytes);
DECLARE_int32(rocksdb_level0_file_num_compaction_trigger);
DECLARE_int32(rocksdb_max_write_buffer_number);
DECLARE_int32(max_prevs_to_avoid_seek);
DECLARE_bool(TEST_skip_applying_truncate);

METRIC_DECLARE_histogram(handler_latency_yb_tserver_TabletServerService_Read);
METRIC_DECLARE_histogram(handler_latency_yb_tserver_TabletServerService_Write);
METRIC_DECLARE_entity(table);

DEFINE_RUNTIME_int32(TEST_scan_reads, 3, "Number of reads in scan tests");

using namespace std::literals;

namespace yb {

extern int TEST_scan_trivial_expectation;
Expand Down Expand Up @@ -1364,6 +1369,34 @@ TEST_F(PgSingleTServerTest, RangeConflict) {
ASSERT_NOK(conn2.Execute("INSERT INTO t VALUES (0, 2), (1, 2)"));
}

TEST_F(PgSingleTServerTest, BootstrapReplayTruncate) {
auto conn = ASSERT_RESULT(Connect());
ASSERT_OK(conn.Execute(
"CREATE TABLE t(id INT PRIMARY KEY, s TEXT) SPLIT INTO 1 TABLETS;"));

ANNOTATE_UNPROTECTED_WRITE(FLAGS_TEST_skip_applying_truncate) = true;
ASSERT_OK(conn.Execute("TRUNCATE TABLE t;"));

// Rollover and flush the WAL, so that the truncate will be replayed during next restart.
auto peers = ListTabletPeers(cluster_.get(), ListPeersFilter::kAll);
for (const auto& peer : peers) {
if (peer->tablet()->transaction_participant()) {
ASSERT_OK(peer->log()->AllocateSegmentAndRollOver());
}
}

ANNOTATE_UNPROTECTED_WRITE(FLAGS_TEST_skip_applying_truncate) = false;

auto* ts = cluster_->mini_tablet_server(0);

ASSERT_OK(ts->Restart());

auto timeout = MonoDelta::FromSeconds(10);
if (!ts->server()->tablet_manager()->WaitForAllBootstrapsToFinish(timeout).ok()) {
LOG(FATAL) << "Tablet bootstrap didn't complete within within " << timeout.ToString();
}
}

TEST_F(PgSingleTServerTest, UpdateIndexWithHole) {
auto conn = ASSERT_RESULT(Connect());
ASSERT_OK(conn.Execute("CREATE TABLE t (id INT PRIMARY KEY, value INT)"));
Expand Down

0 comments on commit c8cbcbf

Please sign in to comment.