From c8cbcbf6648a9647cc98f028463c21f8eb39ae68 Mon Sep 17 00:00:00 2001 From: yusong-yan Date: Thu, 8 Aug 2024 01:20:52 +0000 Subject: [PATCH] [#23243] docdb: Fix tablet bootstrap stuck when replaying truncate operation Summary: **Issue:** Tablet bootstrap can run into a deadlock if it needs to replay a bootstrap operation. Here are the sequence of events leading to the deadlock | Thread 1 (Main bootstrap thread) | Thread 2 (Load Transactions) | | 1. Begin tablet bootstrap | | | 2. During OpenTablet, if transaction is enabled(ysql table or ycql table with transaction enabled), acquire the the `start_latch` by setting its value to 1, and another thread(Thread 2) is created for transaction load| | | | 3. Execute transaction load, acquires `pending_op_counter_blocking_rocksdb_shutdown_start_` to prevent rocksdb shutdown | | 4. Replay tablet truncate operation, waiting for `pending_op_counter_blocking_rocksdb_shutdown_start_` to be released in order to shutdown rocksdb | | | | 5. Transaction load completed | | | 6. Call `LoadFinished` function, it starts waiting for `start_latch` to be 0| | 7. Bootstrap complete, release the the `start_latch` by setting its value to 0 | | | | 8. release the `pending_op_counter_blocking_rocksdb_shutdown_start_` | Thread 1 stucks at step 4, waiting for step 8 to be executed. Thread 2 stucks at step 6, waiting for step 7 to be executed. Result: Tablet bootstrap stuck at replaying truncation operation. This issue starts happening since D29000 (commit id: 5159eb3), the diff Introduced a change to only destroy executor instance(which holds the operation counter) after FinishLoad. **Fix:** Reset `pending_op_counter_blocking_rocksdb_shutdown_start_` before calling `loader_.FinishLoad(status)`. This is not clean fix, but it guarantees the safety, because FinishLoad doesn't need protection from the op counter as it acquires own op counter when processing the pending applies. **Affected Version 2.20.1** Starting from D29000 (commit id: 5159eb3), tablet bootstrap will get stuck when replaying truncate operation. Jira: DB-12175 Test Plan: ./yb_build.sh --cxx-test pgwrapper_pg_single_tserver-test --gtest_filter PgSingleTServerTest.BootstrapReplayTruncate Reviewers: bkolagani, timur, sergei, mbautin, rthallam Reviewed By: sergei, mbautin, rthallam Subscribers: mbautin, yql, slingam, rthallam, ybase Differential Revision: https://phorge.dev.yugabyte.com/D37152 --- .../tablet/operations/truncate_operation.cc | 9 +++++ src/yb/tablet/tablet_bootstrap.cc | 3 ++ src/yb/tablet/transaction_loader.cc | 2 ++ src/yb/tserver/ts_tablet_manager.cc | 10 ++++-- src/yb/tserver/ts_tablet_manager.h | 2 +- .../yql/pgwrapper/pg_single_tserver-test.cc | 33 +++++++++++++++++++ 6 files changed, 56 insertions(+), 3 deletions(-) diff --git a/src/yb/tablet/operations/truncate_operation.cc b/src/yb/tablet/operations/truncate_operation.cc index 18dd9b1e1a4a..4a2c1be44d7d 100644 --- a/src/yb/tablet/operations/truncate_operation.cc +++ b/src/yb/tablet/operations/truncate_operation.cc @@ -22,6 +22,10 @@ #include "yb/util/trace.h" +DEFINE_test_flag(bool, skip_applying_truncate, false, + "If true, the test will skip applying tablet truncate operation." + "Note that other operations will still be applied."); + namespace yb { namespace tablet { @@ -43,6 +47,11 @@ Status TruncateOperation::DoAborted(const Status& status) { Status TruncateOperation::DoReplicated(int64_t leader_term, Status* complete_status) { TRACE("APPLY TRUNCATE: started"); + if (FLAGS_TEST_skip_applying_truncate) { + TRACE("APPLY TRUNCATE: skipped"); + return Status::OK(); + } + RETURN_NOT_OK(VERIFY_RESULT(tablet_safe())->Truncate(this)); TRACE("APPLY TRUNCATE: finished"); diff --git a/src/yb/tablet/tablet_bootstrap.cc b/src/yb/tablet/tablet_bootstrap.cc index 6f1b8f3a0361..6664f0af80ad 100644 --- a/src/yb/tablet/tablet_bootstrap.cc +++ b/src/yb/tablet/tablet_bootstrap.cc @@ -1842,6 +1842,9 @@ class TabletBootstrap { TruncateOperation operation(tablet_, req); + operation.set_op_id(OpId::FromPB(replicate_msg->id())); + operation.set_hybrid_time(HybridTime::FromPB(replicate_msg->hybrid_time())); + Status s = tablet_->Truncate(&operation); RETURN_NOT_OK_PREPEND(s, "Failed to Truncate:"); diff --git a/src/yb/tablet/transaction_loader.cc b/src/yb/tablet/transaction_loader.cc index cee93c9ddf07..a7be08d4865c 100644 --- a/src/yb/tablet/transaction_loader.cc +++ b/src/yb/tablet/transaction_loader.cc @@ -95,6 +95,8 @@ class TransactionLoader::Executor { Status status; auto se = ScopeExit([this, &status] { + scoped_pending_operation_.Reset(); + loader_.FinishLoad(status); // Destroy this executor object. Must be the last statement before we return from the Execute // function. diff --git a/src/yb/tserver/ts_tablet_manager.cc b/src/yb/tserver/ts_tablet_manager.cc index 8fe2dbbcc283..0e93f0c2fb67 100644 --- a/src/yb/tserver/ts_tablet_manager.cc +++ b/src/yb/tserver/ts_tablet_manager.cc @@ -865,10 +865,16 @@ void TSTabletManager::CleanupSplitTablets() { } } -Status TSTabletManager::WaitForAllBootstrapsToFinish() { +Status TSTabletManager::WaitForAllBootstrapsToFinish(MonoDelta timeout) { CHECK_EQ(state(), MANAGER_RUNNING); - open_tablet_pool_->Wait(); + if (timeout.Initialized()) { + if (!open_tablet_pool_->WaitFor(timeout)) { + return STATUS(TimedOut, "Timeout waiting for all bootstraps to finish"); + } + } else { + open_tablet_pool_->Wait(); + } Status s = Status::OK(); diff --git a/src/yb/tserver/ts_tablet_manager.h b/src/yb/tserver/ts_tablet_manager.h index ebeb5ead7cd5..b7429b5a97f0 100644 --- a/src/yb/tserver/ts_tablet_manager.h +++ b/src/yb/tserver/ts_tablet_manager.h @@ -159,7 +159,7 @@ class TSTabletManager : public tserver::TabletPeerLookupIf, public tablet::Table // Returns Status::OK if all tablets bootstrapped successfully. If // the bootstrap of any tablet failed returns the failure reason for // the first tablet whose bootstrap failed. - Status WaitForAllBootstrapsToFinish(); + Status WaitForAllBootstrapsToFinish(MonoDelta timeout = MonoDelta()); // Starts shutdown process. void StartShutdown(); diff --git a/src/yb/yql/pgwrapper/pg_single_tserver-test.cc b/src/yb/yql/pgwrapper/pg_single_tserver-test.cc index 8fb29cb1bb3f..2842c0b153bb 100644 --- a/src/yb/yql/pgwrapper/pg_single_tserver-test.cc +++ b/src/yb/yql/pgwrapper/pg_single_tserver-test.cc @@ -16,6 +16,8 @@ #include "yb/common/colocated_util.h" +#include "yb/consensus/log.h" + #include "yb/tablet/tablet.h" #include "yb/tablet/tablet_peer.h" #include "yb/tablet/transaction_participant.h" @@ -54,6 +56,7 @@ DECLARE_int64(db_block_cache_size_bytes); DECLARE_int32(rocksdb_level0_file_num_compaction_trigger); DECLARE_int32(rocksdb_max_write_buffer_number); DECLARE_int32(max_prevs_to_avoid_seek); +DECLARE_bool(TEST_skip_applying_truncate); METRIC_DECLARE_histogram(handler_latency_yb_tserver_TabletServerService_Read); METRIC_DECLARE_histogram(handler_latency_yb_tserver_TabletServerService_Write); @@ -61,6 +64,8 @@ METRIC_DECLARE_entity(table); DEFINE_RUNTIME_int32(TEST_scan_reads, 3, "Number of reads in scan tests"); +using namespace std::literals; + namespace yb { extern int TEST_scan_trivial_expectation; @@ -1364,6 +1369,34 @@ TEST_F(PgSingleTServerTest, RangeConflict) { ASSERT_NOK(conn2.Execute("INSERT INTO t VALUES (0, 2), (1, 2)")); } +TEST_F(PgSingleTServerTest, BootstrapReplayTruncate) { + auto conn = ASSERT_RESULT(Connect()); + ASSERT_OK(conn.Execute( + "CREATE TABLE t(id INT PRIMARY KEY, s TEXT) SPLIT INTO 1 TABLETS;")); + + ANNOTATE_UNPROTECTED_WRITE(FLAGS_TEST_skip_applying_truncate) = true; + ASSERT_OK(conn.Execute("TRUNCATE TABLE t;")); + + // Rollover and flush the WAL, so that the truncate will be replayed during next restart. + auto peers = ListTabletPeers(cluster_.get(), ListPeersFilter::kAll); + for (const auto& peer : peers) { + if (peer->tablet()->transaction_participant()) { + ASSERT_OK(peer->log()->AllocateSegmentAndRollOver()); + } + } + + ANNOTATE_UNPROTECTED_WRITE(FLAGS_TEST_skip_applying_truncate) = false; + + auto* ts = cluster_->mini_tablet_server(0); + + ASSERT_OK(ts->Restart()); + + auto timeout = MonoDelta::FromSeconds(10); + if (!ts->server()->tablet_manager()->WaitForAllBootstrapsToFinish(timeout).ok()) { + LOG(FATAL) << "Tablet bootstrap didn't complete within within " << timeout.ToString(); + } +} + TEST_F(PgSingleTServerTest, UpdateIndexWithHole) { auto conn = ASSERT_RESULT(Connect()); ASSERT_OK(conn.Execute("CREATE TABLE t (id INT PRIMARY KEY, value INT)"));