From 09427b22137671f11003a66b71303132439beb0c Mon Sep 17 00:00:00 2001
From: Balaji Arun
Date: Tue, 15 Oct 2024 21:37:15 -0700
Subject: [PATCH] [consensus] enable round timeout message (#14914)

* [consensus] enable round timeout message

* [consensus] counters to track round timeout reason
---
 Cargo.lock                            |  1 +
 config/src/config/consensus_config.rs |  2 +-
 consensus/Cargo.toml                  |  1 +
 consensus/src/counters.rs             | 20 +++++++
 consensus/src/round_manager.rs        | 28 ++++++++--
 consensus/src/round_manager_test.rs   | 76 +++++++++++++++++++++++++--
 6 files changed, 117 insertions(+), 11 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index db0353c542f49..63e9f4022a8c6 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -897,6 +897,7 @@ dependencies = [
  "aptos-safety-rules",
  "aptos-schemadb",
  "aptos-secure-storage",
+ "aptos-short-hex-str",
  "aptos-storage-interface",
  "aptos-temppath",
  "aptos-time-service",
diff --git a/config/src/config/consensus_config.rs b/config/src/config/consensus_config.rs
index f907072edfb67..93a3fc15c59cd 100644
--- a/config/src/config/consensus_config.rs
+++ b/config/src/config/consensus_config.rs
@@ -322,7 +322,7 @@ impl Default for ConsensusConfig {
             enable_pre_commit: true,
             max_pending_rounds_in_commit_vote_cache: 100,
             optimistic_sig_verification: false,
-            enable_round_timeout_msg: false,
+            enable_round_timeout_msg: true,
         }
     }
 }
diff --git a/consensus/Cargo.toml b/consensus/Cargo.toml
index 2aa1ee436769c..667b1afb13188 100644
--- a/consensus/Cargo.toml
+++ b/consensus/Cargo.toml
@@ -42,6 +42,7 @@ aptos-runtimes = { workspace = true }
 aptos-safety-rules = { workspace = true }
 aptos-schemadb = { workspace = true }
 aptos-secure-storage = { workspace = true }
+aptos-short-hex-str = { workspace = true }
 aptos-storage-interface = { workspace = true }
 aptos-temppath = { workspace = true }
 aptos-time-service = { workspace = true }
diff --git a/consensus/src/counters.rs b/consensus/src/counters.rs
index 5f655d5a69909..277fe38ed2866 100644
--- a/consensus/src/counters.rs
+++ b/consensus/src/counters.rs
@@ -592,6 +592,26 @@ pub static TIMEOUT_ROUNDS_COUNT: Lazy<IntCounter> = Lazy::new(|| {
     .unwrap()
 });
 
+/// Count of round timeouts by reason and by whether the aggregator is the next proposer.
+pub static AGGREGATED_ROUND_TIMEOUT_REASON: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "aptos_consensus_agg_round_timeout_reason",
+        "Count of round timeouts by reason",
+        &["reason", "is_next_proposer"],
+    )
+    .unwrap()
+});
+
+/// Count of the missing authors, if any, reported in the round timeout reason
+pub static AGGREGATED_ROUND_TIMEOUT_REASON_MISSING_AUTHORS: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "aptos_consensus_agg_round_timeout_reason_missing_authors",
+        "Count of missing authors in round timeout reason",
+        &["author"],
+    )
+    .unwrap()
+});
+
 /// Count the number of timeouts a node experienced since last restart (close to 0 in happy path).
 /// This count is different from `TIMEOUT_ROUNDS_COUNT`, because not every time a node has
 /// a timeout there is an ultimate decision to move to the next round (it might take multiple
diff --git a/consensus/src/round_manager.rs b/consensus/src/round_manager.rs
index 3c8cdb6993159..24d6b9901d3f2 100644
--- a/consensus/src/round_manager.rs
+++ b/consensus/src/round_manager.rs
@@ -60,6 +60,7 @@ use aptos_logger::prelude::*;
 #[cfg(test)]
 use aptos_safety_rules::ConsensusState;
 use aptos_safety_rules::TSafetyRules;
+use aptos_short_hex_str::AsShortHexStr;
 use aptos_types::{
     block_info::BlockInfo,
     epoch_state::EpochState,
@@ -354,14 +355,34 @@ impl RoundManager {
         &mut self,
         new_round_event: NewRoundEvent,
     ) -> anyhow::Result<()> {
+        let is_current_proposer = self
+            .proposer_election
+            .is_valid_proposer(self.proposal_generator.author(), new_round_event.round);
+
         counters::CURRENT_ROUND.set(new_round_event.round as i64);
         counters::ROUND_TIMEOUT_MS.set(new_round_event.timeout.as_millis() as i64);
         match new_round_event.reason {
             NewRoundReason::QCReady => {
                 counters::QC_ROUNDS_COUNT.inc();
             },
-            NewRoundReason::Timeout(_) => {
+            NewRoundReason::Timeout(ref reason) => {
                 counters::TIMEOUT_ROUNDS_COUNT.inc();
+                counters::AGGREGATED_ROUND_TIMEOUT_REASON
+                    .with_label_values(&[&reason.to_string(), &is_current_proposer.to_string()])
+                    .inc();
+                if is_current_proposer {
+                    if let RoundTimeoutReason::PayloadUnavailable { missing_authors } = reason {
+                        let ordered_peers =
+                            self.epoch_state.verifier.get_ordered_account_addresses();
+                        for idx in missing_authors.iter_ones() {
+                            if let Some(author) = ordered_peers.get(idx) {
+                                counters::AGGREGATED_ROUND_TIMEOUT_REASON_MISSING_AUTHORS
+                                    .with_label_values(&[author.short_str().as_str()])
+                                    .inc();
+                            }
+                        }
+                    }
+                }
             },
         };
         info!(
@@ -374,10 +395,7 @@ impl RoundManager {
         self.proposal_status_tracker
             .push(new_round_event.reason.clone());
 
-        if self
-            .proposer_election
-            .is_valid_proposer(self.proposal_generator.author(), new_round_event.round)
-        {
+        if is_current_proposer {
             let epoch_state = self.epoch_state.clone();
             let network = self.network.clone();
             let sync_info = self.block_store.sync_info();
diff --git a/consensus/src/round_manager_test.rs b/consensus/src/round_manager_test.rs
index 29716947991bd..7bccb2747163e 100644
--- a/consensus/src/round_manager_test.rs
+++ b/consensus/src/round_manager_test.rs
@@ -524,6 +524,14 @@ impl NodeSetup {
     }
 }
 
+fn config_with_round_timeout_msg_disabled() -> ConsensusConfig {
+    // Disable RoundTimeoutMsg unless explicitly enabled.
+    ConsensusConfig {
+        enable_round_timeout_msg: false,
+        ..Default::default()
+    }
+}
+
 fn start_replying_to_block_retreival(nodes: Vec<NodeSetup>) -> ReplyingRPCHandle {
     let done = Arc::new(AtomicBool::new(false));
     let mut handles = Vec::new();
@@ -966,7 +974,7 @@ fn sync_info_carried_on_timeout_vote() {
         1,
         None,
         None,
-        None,
+        Some(config_with_round_timeout_msg_disabled()),
         None,
         None,
     );
@@ -1470,7 +1478,7 @@ fn nil_vote_on_timeout() {
         1,
         None,
         None,
-        None,
+        Some(config_with_round_timeout_msg_disabled()),
         None,
         None,
     );
@@ -1553,7 +1561,7 @@ fn vote_resent_on_timeout() {
         1,
         None,
         None,
-        None,
+        Some(config_with_round_timeout_msg_disabled()),
         None,
         None,
     );
@@ -1706,7 +1714,7 @@ fn safety_rules_crash() {
         1,
         None,
         None,
-        None,
+        Some(config_with_round_timeout_msg_disabled()),
         None,
         None,
     );
@@ -1772,7 +1780,7 @@ fn echo_timeout() {
         4,
         None,
         None,
-        None,
+        Some(config_with_round_timeout_msg_disabled()),
         None,
         None,
     );
@@ -1825,6 +1833,64 @@ fn echo_timeout() {
     });
 }
 
+#[test]
+fn echo_round_timeout_msg() {
+    let runtime = consensus_runtime();
+    let mut playground = NetworkPlayground::new(runtime.handle().clone());
+    let mut nodes = NodeSetup::create_nodes(
+        &mut playground,
+        runtime.handle().clone(),
+        4,
+        None,
+        None,
+        None,
+        None,
+        None,
+    );
+    runtime.spawn(playground.start());
+    timed_block_on(&runtime, async {
+        // clear the message queue
+        for node in &mut nodes {
+            node.next_proposal().await;
+        }
+        // timeout 3 nodes
+        for node in &mut nodes[1..] {
+            node.round_manager
+                .process_local_timeout(1)
+                .await
+                .unwrap_err();
+        }
+        let node_0 = &mut nodes[0];
+        // node 0 doesn't time out on its own and should echo the timeout after 2 timeout messages
+        for i in 0..3 {
+            let timeout_vote = node_0.next_timeout().await;
+            let result = node_0
+                .round_manager
+                .process_round_timeout_msg(timeout_vote)
+                .await;
+            // the first and third messages should not return an error
+            if i == 0 || i == 2 {
+                assert!(result.is_ok());
+            }
+            if i == 1 {
+                // the second message triggers node 0's own timeout, which is an Err
+                assert!(result.is_err());
+            }
+        }
+
+        let node_1 = &mut nodes[1];
+        // it receives 4 timeout messages (1 from each) and doesn't echo since it already timed out
+        for _ in 0..4 {
+            let timeout_vote = node_1.next_timeout().await;
+            node_1
+                .round_manager
+                .process_round_timeout_msg(timeout_vote)
+                .await
+                .unwrap();
+        }
+    });
+}
+
 #[test]
 fn no_next_test() {
     let runtime = consensus_runtime();
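
For reference, below is a minimal, self-contained sketch of the labeled-counter pattern the new metrics rely on. It is not part of the patch: it uses the upstream `prometheus` and `once_cell` crates directly rather than the aptos-metrics-core re-exports the patch goes through, and the metric name and label values are illustrative only.

use once_cell::sync::Lazy;
use prometheus::{register_int_counter_vec, IntCounterVec};

// Illustrative metric; the counter added by the patch is registered through
// aptos-metrics-core under the name "aptos_consensus_agg_round_timeout_reason".
static ROUND_TIMEOUT_REASON: Lazy<IntCounterVec> = Lazy::new(|| {
    register_int_counter_vec!(
        "example_round_timeout_reason",
        "Count of round timeouts by reason",
        &["reason", "is_next_proposer"]
    )
    .unwrap()
});

fn main() {
    // Each distinct (reason, is_next_proposer) label pair gets its own
    // monotonically increasing counter. The label values here are made up.
    ROUND_TIMEOUT_REASON
        .with_label_values(&["PayloadUnavailable", "true"])
        .inc();
    ROUND_TIMEOUT_REASON
        .with_label_values(&["Unknown", "false"])
        .inc();
}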