diff --git a/src/yb/consensus/raft_consensus.cc b/src/yb/consensus/raft_consensus.cc index 4d821860a4a1..84ad86922765 100644 --- a/src/yb/consensus/raft_consensus.cc +++ b/src/yb/consensus/raft_consensus.cc @@ -2177,6 +2177,9 @@ Status RaftConsensus::WaitForLeaderLeaseImprecise(MonoTime deadline) { { ReplicaState::UniqueLock lock; RETURN_NOT_OK(state_->LockForRead(&lock)); + if (state_->GetActiveRoleUnlocked() != RaftPeerPB::LEADER) { + return STATUS_FORMAT(IllegalState, "Not the leader: $0", state_->GetActiveRoleUnlocked()); + } leader_lease_status = state_->GetLeaderLeaseStatusUnlocked(&remaining_old_leader_lease); } switch (leader_lease_status) { diff --git a/src/yb/master/catalog_manager-internal.h b/src/yb/master/catalog_manager-internal.h index 12a7f02a635c..39cde02c0067 100644 --- a/src/yb/master/catalog_manager-internal.h +++ b/src/yb/master/catalog_manager-internal.h @@ -93,15 +93,15 @@ template bool CatalogManager::ScopedLeaderSharedLock::CheckIsInitializedAndIsLeaderOrRespondInternal( RespClass* resp, rpc::RpcContext* rpc) { - Status& s = catalog_status_; - if (PREDICT_TRUE(s.ok())) { - s = leader_status_; - if (PREDICT_TRUE(s.ok())) { + const Status* status = &catalog_status_; + if (PREDICT_TRUE(status->ok())) { + status = &leader_status_; + if (PREDICT_TRUE(status->ok())) { return true; } } - StatusToPB(s, resp->mutable_error()->mutable_status()); + StatusToPB(*status, resp->mutable_error()->mutable_status()); resp->mutable_error()->set_code(ErrorClass::NOT_THE_LEADER); rpc->RespondSuccess(); return false; diff --git a/src/yb/master/catalog_manager.cc b/src/yb/master/catalog_manager.cc index dac177aa9985..0eed35b4edd5 100644 --- a/src/yb/master/catalog_manager.cc +++ b/src/yb/master/catalog_manager.cc @@ -110,6 +110,7 @@ #include "yb/tablet/tablet_metadata.h" #include "yb/yql/redis/redisserver/redis_constants.h" #include "yb/tserver/tserver_admin.proxy.h" + #include "yb/util/crypt.h" #include "yb/util/debug/trace_event.h" #include "yb/util/flag_tags.h" @@ -123,9 +124,13 @@ #include "yb/util/thread_restrictions.h" #include "yb/util/threadpool.h" #include "yb/util/trace.h" +#include "yb/util/tsan_util.h" #include "yb/util/uuid.h" + #include "yb/tserver/remote_bootstrap_client.h" +using namespace std::literals; + DEFINE_int32(master_ts_rpc_timeout_ms, 30 * 1000, // 30 sec "Timeout used for the Master->TS async rpc calls."); TAG_FLAG(master_ts_rpc_timeout_ms, advanced); @@ -237,6 +242,8 @@ using master::MasterServiceProxy; using yb::util::kBcryptHashSize; using yb::util::bcrypt_hashpw; +const auto kLockLongLimit = RegularBuildVsSanitizers(100ms, 750ms); + constexpr const char* const kDefaultCassandraUsername = "cassandra"; constexpr const char* const kDefaultCassandraPassword = "cassandra"; @@ -644,6 +651,7 @@ void CatalogManagerBgTasks::Run() { // - CreateTable will call Wake() to notify about the tablets to add // - HandleReportedTablet/ProcessPendingAssignments will call WakeIfHasPendingUpdates() // to notify about tablets creation. + l.Unlock(); Wait(FLAGS_catalog_manager_bg_task_wait_ms); } VLOG(1) << "Catalog manager background task thread shutting down"; @@ -799,7 +807,12 @@ Status CatalogManager::VisitSysCatalog() { // Block new catalog operations, and wait for existing operations to finish. LOG(INFO) << __func__ << ": Wait on leader_lock_ for any existing operations to finish."; + auto start = std::chrono::steady_clock::now(); std::lock_guard leader_lock_guard(leader_lock_); + auto finish = std::chrono::steady_clock::now(); + if (finish > start + kLockLongLimit) { + LOG(ERROR) << "Long wait on leader_lock_: " << yb::ToString(finish - start); + } LOG(INFO) << __func__ << ": Acquire catalog manager lock_ before loading sys catalog.."; boost::lock_guard lock(lock_); @@ -5402,7 +5415,8 @@ void CatalogManager::AbortAndWaitForAllTasks(const vectorleader_lock_, std::try_to_lock) { + leader_shared_lock_(catalog->leader_lock_, std::try_to_lock), + start_(std::chrono::steady_clock::now()) { // Check if the catalog manager is running. std::lock_guard l(catalog_->state_lock_); @@ -5455,6 +5469,20 @@ CatalogManager::ScopedLeaderSharedLock::ScopedLeaderSharedLock(CatalogManager* c } } +void CatalogManager::ScopedLeaderSharedLock::Unlock() { + if (leader_shared_lock_.owns_lock()) { + { + decltype(leader_shared_lock_) lock; + lock.swap(leader_shared_lock_); + } + auto finish = std::chrono::steady_clock::now(); + if (finish > start_ + kLockLongLimit) { + LOG(ERROR) << "Long lock of catalog manager: " << yb::ToString(finish - start_) << "\n" + << GetStackTrace(); + } + } +} + //////////////////////////////////////////////////////////// // TabletInfo //////////////////////////////////////////////////////////// diff --git a/src/yb/master/catalog_manager.h b/src/yb/master/catalog_manager.h index 4482f12c4de7..b2aa6af102d4 100644 --- a/src/yb/master/catalog_manager.h +++ b/src/yb/master/catalog_manager.h @@ -678,6 +678,10 @@ class CatalogManager : public tserver::TabletPeerLookupIf { // 'catalog' must outlive this object. explicit ScopedLeaderSharedLock(CatalogManager* catalog); + ~ScopedLeaderSharedLock() { Unlock(); } + + void Unlock(); + // General status of the catalog manager. If not OK (e.g. the catalog // manager is still being initialized), all operations are illegal and // leader_status() should not be trusted. @@ -727,6 +731,7 @@ class CatalogManager : public tserver::TabletPeerLookupIf { shared_lock leader_shared_lock_; Status catalog_status_; Status leader_status_; + std::chrono::steady_clock::time_point start_; }; explicit CatalogManager(Master *master); diff --git a/src/yb/master/master.cc b/src/yb/master/master.cc index 8a4297ebc47a..a01e71f675ca 100644 --- a/src/yb/master/master.cc +++ b/src/yb/master/master.cc @@ -249,6 +249,8 @@ Status Master::WaitUntilCatalogManagerIsLeaderAndReadyForTests(const MonoDelta& if (l.catalog_status().ok() && l.leader_status().ok()) { return Status::OK(); } + l.Unlock(); + SleepFor(MonoDelta::FromMilliseconds(backoff_ms)); backoff_ms = min(backoff_ms << 1, kMaxBackoffMs); } while (MonoTime::Now().GetDeltaSince(start).LessThan(timeout));