From 2f9afb3ce76d6494f823ff5171a22b010794e8d2 Mon Sep 17 00:00:00 2001 From: Sandeep L <99765181+lingamsandeep@users.noreply.github.com> Date: Thu, 22 Feb 2024 12:02:21 -0800 Subject: [PATCH] [BACKPORT 2.20][#21096] DocDB: Avoid fatal due to narrow_cast during ListTabletServers Summary: Original commit: aa2efd7e8258fc2b6aaf4468b8d6b5e899b4ba14 / D32496 During ListTabletServers, we occasionally experience a FATAL with the following stack whenever the last heartbeat was > 24 days ago, because the elapsed time in milliseconds no longer fits in an int32. While this is a remote possibility, it is still a possibility. So as part of ListTabletServers, if the time since the last heartbeat exceeds int32 max milliseconds, we clamp the reported value to int32 max. ``` F20240207 16:26:17 ../../src/yb/gutil/casts.cc:21] Bad narrow cast: 2205994749 > 2147483647 @ 0x55e4d34d1257 google::LogMessage::SendToLog() @ 0x55e4d34d219d google::LogMessage::Flush() @ 0x55e4d34d2819 google::LogMessageFatal::~LogMessageFatal() @ 0x55e4d3cd4c69 yb::BadNarrowCast() @ 0x55e4d3741898 yb::narrow_cast<>() @ 0x55e4d3f163aa yb::master::(anonymous namespace)::MasterClusterServiceImpl::ListTabletServers() @ 0x55e4d414a455 std::__1::__function::__func<>::operator()() @ 0x55e4d414b33f yb::master::MasterClusterIf::Handle() @ 0x55e4d44aaeda yb::rpc::ServicePoolImpl::Handle() @ 0x55e4d43ea97f yb::rpc::InboundCall::InboundCallTask::Run() @ 0x55e4d44b9a73 yb::rpc::(anonymous namespace)::Worker::Execute() @ 0x55e4d4b8ab02 yb::Thread::SuperviseThread() @ 0x7f8ebad27694 start_thread @ 0x7f8ebb22941d __clone ``` Jira: DB-10056 Test Plan: MasterTest.TestRegisterAndHeartbeat Reviewers: bkolagani, arybochkin Reviewed By: bkolagani Subscribers: bogdan, ybase Tags: #jenkins-ready Differential Revision: https://phorge.dev.yugabyte.com/D32600 --- src/yb/master/master_cluster_service.cc | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/yb/master/master_cluster_service.cc b/src/yb/master/master_cluster_service.cc index 3c56add62273..862a09c47cfe 100644 --- 
a/src/yb/master/master_cluster_service.cc +++ b/src/yb/master/master_cluster_service.cc @@ -137,8 +137,14 @@ class MasterClusterServiceImpl : public MasterServiceBase, public MasterClusterI *entry->mutable_registration() = std::move(*ts_info.mutable_registration()); auto last_heartbeat = desc->LastHeartbeatTime(); if (last_heartbeat) { - entry->set_millis_since_heartbeat(narrow_cast( - MonoTime::Now().GetDeltaSince(last_heartbeat).ToMilliseconds())); + auto ms_since_heartbeat = MonoTime::Now().GetDeltaSince(last_heartbeat).ToMilliseconds(); + if (ms_since_heartbeat > std::numeric_limits::max()) { + LOG(DFATAL) << entry->instance_id().permanent_uuid() + << " has not heartbeated since " + << ms_since_heartbeat; + ms_since_heartbeat = std::numeric_limits::max(); + } + entry->set_millis_since_heartbeat(narrow_cast(ms_since_heartbeat)); } entry->set_alive(desc->IsLive()); desc->GetMetrics(entry->mutable_metrics());