diff --git a/Cargo.lock b/Cargo.lock
index e6096f45511dcf..3ebc6829b7c345 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -897,6 +897,7 @@ dependencies = [
  "aptos-safety-rules",
  "aptos-schemadb",
  "aptos-secure-storage",
+ "aptos-short-hex-str",
  "aptos-storage-interface",
  "aptos-temppath",
  "aptos-time-service",
diff --git a/consensus/Cargo.toml b/consensus/Cargo.toml
index 2aa1ee436769cc..667b1afb131886 100644
--- a/consensus/Cargo.toml
+++ b/consensus/Cargo.toml
@@ -42,6 +42,7 @@ aptos-runtimes = { workspace = true }
 aptos-safety-rules = { workspace = true }
 aptos-schemadb = { workspace = true }
 aptos-secure-storage = { workspace = true }
+aptos-short-hex-str = { workspace = true }
 aptos-storage-interface = { workspace = true }
 aptos-temppath = { workspace = true }
 aptos-time-service = { workspace = true }
diff --git a/consensus/src/counters.rs b/consensus/src/counters.rs
index 5f655d5a69909f..f0bdd91f998bc6 100644
--- a/consensus/src/counters.rs
+++ b/consensus/src/counters.rs
@@ -592,6 +592,26 @@ pub static TIMEOUT_ROUNDS_COUNT: Lazy = Lazy::new(|| {
     .unwrap()
 });
 
+/// Count of the round timeout by reason and by whether the aggregator is the next proposer.
+pub static ROUND_TIMEOUT_REASON: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "aptos_consensus_round_timeout_reason",
+        "Count of round timeouts by reason",
+        &["reason", "is_next_proposer"],
+    )
+    .unwrap()
+});
+
+/// Count of the missing authors if any reported in the round timeout reason
+pub static ROUND_TIMEOUT_REASON_MISSING_AUTHORS: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "aptos_consensus_round_timeout_reason_missing_authors",
+        "Count of missing authors in round timeout reason",
+        &["author"],
+    )
+    .unwrap()
+});
+
 /// Count the number of timeouts a node experienced since last restart (close to 0 in happy path).
 /// This count is different from `TIMEOUT_ROUNDS_COUNT`, because not every time a node has
 /// a timeout there is an ultimate decision to move to the next round (it might take multiple
diff --git a/consensus/src/round_manager.rs b/consensus/src/round_manager.rs
index 3c8cdb6993159a..d775819ce0009a 100644
--- a/consensus/src/round_manager.rs
+++ b/consensus/src/round_manager.rs
@@ -60,6 +60,7 @@ use aptos_logger::prelude::*;
 #[cfg(test)]
 use aptos_safety_rules::ConsensusState;
 use aptos_safety_rules::TSafetyRules;
+use aptos_short_hex_str::AsShortHexStr;
 use aptos_types::{
     block_info::BlockInfo,
     epoch_state::EpochState,
@@ -354,14 +355,34 @@ impl RoundManager {
         &mut self,
         new_round_event: NewRoundEvent,
     ) -> anyhow::Result<()> {
+        let is_current_proposer = self
+            .proposer_election
+            .is_valid_proposer(self.proposal_generator.author(), new_round_event.round);
+
         counters::CURRENT_ROUND.set(new_round_event.round as i64);
         counters::ROUND_TIMEOUT_MS.set(new_round_event.timeout.as_millis() as i64);
         match new_round_event.reason {
             NewRoundReason::QCReady => {
                 counters::QC_ROUNDS_COUNT.inc();
             },
-            NewRoundReason::Timeout(_) => {
+            NewRoundReason::Timeout(ref reason) => {
                 counters::TIMEOUT_ROUNDS_COUNT.inc();
+                counters::ROUND_TIMEOUT_REASON
+                    .with_label_values(&[&reason.to_string(), &is_current_proposer.to_string()])
+                    .inc();
+                if is_current_proposer {
+                    if let RoundTimeoutReason::PayloadUnavailable { missing_authors } = reason {
+                        let ordered_peers =
+                            self.epoch_state.verifier.get_ordered_account_addresses();
+                        for idx in missing_authors.iter_ones() {
+                            if let Some(author) = ordered_peers.get(idx) {
+                                counters::ROUND_TIMEOUT_REASON_MISSING_AUTHORS
+                                    .with_label_values(&[author.short_str().as_str()])
+                                    .inc();
+                            }
+                        }
+                    }
+                }
             },
         };
         info!(
@@ -374,10 +395,7 @@ impl RoundManager {
         self.proposal_status_tracker
             .push(new_round_event.reason.clone());
 
-        if self
-            .proposer_election
-            .is_valid_proposer(self.proposal_generator.author(), new_round_event.round)
-        {
+        if is_current_proposer {
             let epoch_state = self.epoch_state.clone();
             let network = self.network.clone();
             let sync_info = self.block_store.sync_info();