From 985bb8d6dc338f0a9e8d28a537940412e01d3873 Mon Sep 17 00:00:00 2001 From: Zachary Drudi Date: Wed, 25 Sep 2024 18:53:26 -0400 Subject: [PATCH] [BACKPORT 2024.2][#21178] docdb: Add metric for the max master follower heartbeat delay. Summary: Adds a metric for the maximum master follower heartbeat delay. Ideally this metric would report the raw follower delays directly and clients could calculate the maximum themselves. However there is no metric entity we could use to export a metric from the leader master with the right fields. Adding a new metric entity for just this metric seems too involved given the current use-case (alert if the follower delay is too large). Another alternative would be adding a server entity metric to each master, and have followers report their own delay. But we are interested in the perspective from the master leader here and that approach could introduce surprising behaviour. Jira: DB-10113 Original commit: 678d2771dee9d868b3c14fa7315c80f02fc1180e / D38319 Test Plan: ``` ./yb_build.sh --with-tests --cxx-test-filter-re tablet_health_manager-itest --cxx-test tablet_health_manager-itest --gtest_filter 'AreNodesSafeToTakeDownItest.GetFollowerUpdate*' ``` Reviewers: asrivastava, sanketh Reviewed By: asrivastava Subscribers: slingam, ybase Differential Revision: https://phorge.dev.yugabyte.com/D38441 --- .../tablet_health_manager-itest.cc | 65 ++++++++++++++++++- src/yb/master/catalog_manager.cc | 28 ++++++++ src/yb/master/catalog_manager.h | 2 + src/yb/util/monotime.cc | 6 ++ src/yb/util/monotime.h | 2 + 5 files changed, 101 insertions(+), 2 deletions(-) diff --git a/src/yb/integration-tests/tablet_health_manager-itest.cc b/src/yb/integration-tests/tablet_health_manager-itest.cc index dbb2a14b6d10..a4a8c569119e 100644 --- a/src/yb/integration-tests/tablet_health_manager-itest.cc +++ b/src/yb/integration-tests/tablet_health_manager-itest.cc @@ -12,6 +12,7 @@ // #include +#include #include #include @@ -29,6 +30,7 @@ #include "yb/tools/yb-admin_client.h" #include "yb/util/backoff_waiter.h" +#include "yb/util/curl_util.h" #include "yb/util/monotime.h" #include "yb/util/test_macros.h" #include "yb/util/tsan_util.h" @@ -36,6 +38,7 @@ DECLARE_int32(are_nodes_safe_to_take_down_timeout_buffer_ms); DECLARE_double(leader_failure_max_missed_heartbeat_periods); DECLARE_int32(raft_heartbeat_interval_ms); +DECLARE_int32(catalog_manager_bg_task_wait_ms); using std::string; using std::vector; @@ -43,6 +46,19 @@ using std::vector; namespace yb { namespace integration_tests { +template +Result parse(const std::string& s); + +template <> +Result parse(const std::string& s) { + std::size_t pos; + auto out = std::stoull(s, &pos); + if (pos != s.size()) { + return STATUS(InvalidArgument, "Couldn't parse string to unsigned integer: $0", s); + } + return out; +} + class AreNodesSafeToTakeDownItest : public YBTableTestBase { protected: void BeforeCreateTable() override { @@ -90,6 +106,32 @@ class AreNodesSafeToTakeDownItest : public YBTableTestBase { } const int kFollowerLagBoundMs = 1000 * kTimeMultiplier; + + template + Result GetLeaderMasterMetric(const std::string& metric_name) { + EasyCurl curl; + auto addr = external_mini_cluster_->GetLeaderMaster()->bound_http_hostport(); + faststring out; + RETURN_NOT_OK( + curl.FetchURL(Format("http://$0:$1/prometheus-metrics", addr.host(), addr.port()), &out)); + std::stringstream ss(out.ToString()); + for (std::string line; std::getline(ss, line, '\n');) { + if (!line.starts_with(metric_name)) { + continue; + } + std::vector words; + std::stringstream line_ss(line); + for (std::string word; std::getline(line_ss, word, ' ');) { + words.push_back(std::move(word)); + } + if (words.size() < 2) { + return STATUS_FORMAT(IllegalState, "Bad prometheus metric line: $0", line); + } + return parse(words[words.size() - 2]); + } + return STATUS_FORMAT( + NotFound, "Couldn't find metric $0 on prometheus-metrics page", metric_name); + } }; TEST_F(AreNodesSafeToTakeDownItest, HealthyCluster) { @@ -257,6 +299,9 @@ TEST_F(AreNodesSafeToTakeDownItest, GetFollowerUpdateDelay) { for (const auto& heartbeat_delay : resp.heartbeat_delay()) { ASSERT_LE(heartbeat_delay.last_heartbeat_delta_ms(), max_expected_heartbeat_time); } + auto max_actual_heartbeat_delay = + ASSERT_RESULT(GetLeaderMasterMetric("max_follower_heartbeat_delay")); + ASSERT_LE(max_actual_heartbeat_delay, max_expected_heartbeat_time); } // Stop one of the nodes and ensure its delay increases. @@ -271,7 +316,7 @@ TEST_F(AreNodesSafeToTakeDownItest, GetFollowerUpdateDelayWithStoppedNode) { ASSERT_NE(master_it, masters.end()); auto other_master = *master_it; other_master->Shutdown(); - auto sleep_time = FLAGS_raft_heartbeat_interval_ms * 3; + auto sleep_time = static_cast(FLAGS_raft_heartbeat_interval_ms * 3); SleepFor(MonoDelta::FromMilliseconds(sleep_time)); master::GetMasterHeartbeatDelaysRequestPB req; master::GetMasterHeartbeatDelaysResponsePB resp; @@ -286,7 +331,23 @@ TEST_F(AreNodesSafeToTakeDownItest, GetFollowerUpdateDelayWithStoppedNode) { ASSERT_GE(heartbeat_delay.last_heartbeat_delta_ms(), sleep_time); } } - + // Verify the heartbeat delay shows up in the metric max_follower_heartbeat_delay. This metric is + // updated by the catalog manager background task. The background task may not have run recently + // enough to pick up the delay so we retry a few times. + std::string failure_message; + ASSERT_OK(WaitFor( + [this, sleep_time, &failure_message]() -> Result { + auto max_actual_heartbeat_delay = + VERIFY_RESULT(GetLeaderMasterMetric("max_follower_heartbeat_delay")); + auto done = max_actual_heartbeat_delay >= sleep_time; + if (!done) { + failure_message = Format( + "Heartbeat delay from metric is $0, expecting it to be at least $1", + max_actual_heartbeat_delay, sleep_time); + } + return done; + }, + MonoDelta::FromMilliseconds(FLAGS_catalog_manager_bg_task_wait_ms * 3), failure_message)); ASSERT_OK(other_master->Restart()); } diff --git a/src/yb/master/catalog_manager.cc b/src/yb/master/catalog_manager.cc index 2d4a7dfe8ff8..25d8ab6b95ff 100644 --- a/src/yb/master/catalog_manager.cc +++ b/src/yb/master/catalog_manager.cc @@ -527,6 +527,15 @@ METRIC_DEFINE_gauge_uint32(cluster, num_tablet_servers_dead, "heartbeat in the time interval defined by the gflag " "FLAGS_tserver_unresponsive_timeout_ms."); +METRIC_DEFINE_gauge_uint64(cluster, max_follower_heartbeat_delay, "The maximum heartbeat delay " + "among all master followers", + yb::MetricUnit::kMilliseconds, + "The number of milliseconds since the master leader has ACK'd a " + "heartbeat from all peers of the system catalog tablet. The master " + "leader does not ACK heartbeats from peers that have fallen behind due " + "to WAL GC so such peers will cause this metric to increase " + "indefinitely."); + METRIC_DEFINE_counter(cluster, create_table_too_many_tablets, "How many CreateTable requests have failed due to too many tablets", yb::MetricUnit::kRequests, "The number of CreateTable request errors due to attempting to create too many tablets."); @@ -979,6 +988,9 @@ Status CatalogManager::Init() { metric_create_table_too_many_tablets_ = METRIC_create_table_too_many_tablets.Instantiate(master_->metric_entity_cluster()); + metric_max_follower_heartbeat_delay_ = + METRIC_max_follower_heartbeat_delay.Instantiate(master_->metric_entity_cluster(), 0); + cdc_state_table_ = std::make_unique(master_->cdc_state_client_future()); RETURN_NOT_OK(xcluster_manager_->Init()); @@ -11680,6 +11692,22 @@ void CatalogManager::ReportMetrics() { auto num_servers = master_->ts_manager()->NumDescriptors(); metric_num_tablet_servers_dead_->set_value( narrow_cast(num_servers - num_live_servers)); + + // Report the max master follower heartbeat delay. + auto consensus_result = tablet_peer()->GetConsensus(); + if (consensus_result) { + MonoTime earliest_time = MonoTime::kUninitialized; + for (const auto& last_communication_time : + (*consensus_result)->GetFollowerCommunicationTimes()) { + earliest_time.MakeAtMost(last_communication_time.last_successful_communication); + } + int64_t time_in_ms = + earliest_time ? MonoTime::Now().GetDeltaSince(earliest_time).ToMilliseconds() : 0; + if (time_in_ms < 0) { + time_in_ms = 0; + } + metric_max_follower_heartbeat_delay_->set_value(static_cast(time_in_ms)); + } } void CatalogManager::ResetMetrics() { diff --git a/src/yb/master/catalog_manager.h b/src/yb/master/catalog_manager.h index 9a6deb354bc7..023262c6713b 100644 --- a/src/yb/master/catalog_manager.h +++ b/src/yb/master/catalog_manager.h @@ -2379,6 +2379,8 @@ class CatalogManager : public tserver::TabletPeerLookupIf, scoped_refptr metric_create_table_too_many_tablets_; + scoped_refptr> metric_max_follower_heartbeat_delay_; + friend class ClusterLoadBalancer; // Policy for load balancing tablets on tablet servers. diff --git a/src/yb/util/monotime.cc b/src/yb/util/monotime.cc index c8cc1876267d..d083d8bf06e6 100644 --- a/src/yb/util/monotime.cc +++ b/src/yb/util/monotime.cc @@ -380,6 +380,12 @@ void MonoTime::MakeAtLeast(MonoTime rhs) { } } +void MonoTime::MakeAtMost(MonoTime rhs) { + if (rhs.Initialized() && (!Initialized() || value_ > rhs.value_)) { + value_ = rhs.value_; + } +} + // ------------------------------------------------------------------------------------------------ std::string FormatForComparisonFailureMessage(const MonoDelta& op, const MonoDelta& other) { diff --git a/src/yb/util/monotime.h b/src/yb/util/monotime.h index bc963536b262..1fdb52ba1aab 100644 --- a/src/yb/util/monotime.h +++ b/src/yb/util/monotime.h @@ -235,6 +235,8 @@ class MonoTime { // Set this time to the given value if it is lower than that or uninitialized. void MakeAtLeast(MonoTime rhs); + void MakeAtMost(MonoTime rhs); + TimePoint ToSteadyTimePoint() const { return value_; }