Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[BACKPORT 2024.1][#23747] MetaCache: Callback should not be called wh…
…ile holding the lock Summary: Original commit: c770d79 / D37706 Call callback in ScopeExit block only. Not while holding the lock. Without this fix, it is possible that a thread can get into a deadlock, trying to request a shared_lock on a mutex, while already holding an exclusive lock on the same mutex: This deadlock can be triggered if there are active read/write requests to a Table (from more than 1 thread) right after the table had a tablet-split. If there is only 1 thread, it is unlikely to run into the deadlock, as the thread notices -- as part of the callback -- that the table's partition info is stale. Having a different thread refresh the partition version before the main thread checks if the table version is stale, is likely necessary to trigger the stack trace seen below. e.g: ``` #0 pthread_cond_wait@@GLIBC_2.3.2 () at ../sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S:185 #1 0x00005640c3eb441b in std::__1::shared_timed_mutex::lock_shared() () #2 0x00005640c3ffcbff in yb::client::internal::MetaCache::LookupTabletByKey(std::__1::shared_ptr<yb::client::YBTable> const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::chrono::time_point<yb::CoarseMonoClock, std::__1::chrono::duration<long long, std::__1::ratio<1l, 1000000000l> > >, std::__1::function<void (yb::Result<scoped_refptr<yb::client::internal::RemoteTablet> > const&)>, yb::StronglyTypedBool<yb::client::internal::FailOnPartitionListRefreshed_Tag>) () #3 0x00005640c3f7549a in yb::client::internal::Batcher::LookupTabletFor(yb::client::internal::InFlightOp*) () #4 0x00005640c401855e in yb::client::(anonymous namespace)::FlushBatcherAsync(std::__1::shared_ptr<yb::client::internal::Batcher> const&, boost::function<void (yb::client::FlushStatus*)>, yb::client::YBSession::BatcherConfig, yb::StronglyTypedBool<yb::client::internal::IsWithinTransactionRetry_Tag>) () #5 0x00005640c401aa76 in yb::client::(anonymous namespace)::BatcherFlushDone(std::__1::shared_ptr<yb::client::internal::Batcher> const&, yb::Status const&, boost::function<void (yb::client::FlushStatus*)>, yb::client::YBSession::BatcherConfig) () #6 0x00005640c401b371 in boost::detail::function::void_function_obj_invoker1<std::__1::__bind<void (*)(std::__1::shared_ptr<yb::client::internal::Batcher> const&, yb::Status const&, boost::function<void (yb::client::FlushStatus*)>, yb::client::YBSession::BatcherConfig), std::__1::shared_ptr<yb::client::internal::Batcher> const&, std::__1::placeholders::__ph<1> const&, boost::function<void (yb::client::FlushStatus*)>, yb::client::YBSession::BatcherConfig&>, void, yb::Status const&>::invoke(boost::detail::function::function_buffer&, yb::Status const&) () #7 0x00005640c3f70398 in yb::client::internal::Batcher::Run() () #8 0x00005640c3f72656 in yb::client::internal::Batcher::FlushFinished() () #9 0x00005640c3f74a4d in yb::client::internal::Batcher::TabletLookupFinished(yb::client::internal::InFlightOp*, yb::Result<scoped_refptr<yb::client::internal::RemoteTablet> >) () #10 0x00005640c3f759bc in std::__1::__function::__func<yb::client::internal::Batcher::LookupTabletFor(yb::client::internal::InFlightOp*)::$_0, std::__1::allocator<yb::client::internal::Batcher::LookupTabletFor(yb::client::internal::InFlightOp*)::$_0>, void (yb::Result<scoped_refptr<yb::client::internal::RemoteTablet> > const&)>::operator()(yb::Result<scoped_refptr<yb::client::internal::RemoteTablet> > const&) () #11 0x00005640c3fff05d in yb::client::internal::MetaCache::LookupTabletByKey(std::__1::shared_ptr<yb::client::YBTable> const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::chrono::time_point<yb::CoarseMonoClock, std::__1::chrono::duration<long long, std::__1::ratio<1l, 1000000000l> > >, std::__1::function<void (yb::Result<scoped_refptr<yb::client::internal::RemoteTablet> > const&)>, yb::StronglyTypedBool<yb::client::internal::FailOnPartitionListRefreshed_Tag>) () ** Is holding an exclusive lock in MetaCache::LookupTabletByKey/DoLookupTabletByKey ** #12 0x00005640c3f7549a in yb::client::internal::Batcher::LookupTabletFor(yb::client::internal::InFlightOp*) () #13 0x00005640c401855e in yb::client::(anonymous namespace)::FlushBatcherAsync(std::__1::shared_ptr<yb::client::internal::Batcher> const&, boost::function<void (yb::client::FlushStatus*)>, yb::client::YBSession::BatcherConfig, yb::StronglyTypedBool<yb::client::internal::IsWithinTransactionRetry_Tag>) () #14 0x00005640c4017130 in yb::client::YBSession::FlushAsync(boost::function<void (yb::client::FlushStatus*)>) () #15 0x00005640c5225a0c in yb::tserver::PgClientServiceImpl::Perform(yb::tserver::PgPerformRequestPB const*, yb::tserver::PgPerformResponsePB*, yb::rpc::RpcContext) () #16 0x00005640c51c4487 in std::__1::__function::__func<yb::tserver::PgClientServiceIf::InitMethods(scoped_refptr<yb::MetricEntity> const&)::$_20, std::__1::allocator<yb::tserver::PgClientServiceIf::InitMethods(scoped_refptr<yb::MetricEntity> const&)::$_20>, void (std::__1::shared_ptr<yb::rpc::InboundCall>)>::operator()(std::__1::shared_ptr<yb::rpc::InboundCall>&&) () #17 0x00005640c51d374f in yb::tserver::PgClientServiceIf::Handle(std::__1::shared_ptr<yb::rpc::InboundCall>) () #18 0x00005640c4f5f420 in yb::rpc::ServicePoolImpl::Handle(std::__1::shared_ptr<yb::rpc::InboundCall>) () #19 0x00005640c4e845af in yb::rpc::InboundCall::InboundCallTask::Run() () #20 0x00005640c4f6e243 in yb::rpc::(anonymous namespace)::Worker::Execute() () #21 0x00005640c570ecb4 in yb::Thread::SuperviseThread(void*) () #22 0x00007f808b7c6694 in start_thread (arg=0x7f76d8caf700) at pthread_create.c:333 #23 0x00007f808bac341d in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:109 ``` Jira: DB-12651 Test Plan: Jenkins yb_build.sh --cxx-test ql-stress-test QLStressTest.ReproMetaCacheDeadlock Reviewers: rthallam, hsunder, qhu, timur Reviewed By: rthallam Subscribers: ybase, svc_phabricator Differential Revision: https://phorge.dev.yugabyte.com/D37788
- Loading branch information