From 3271b83d4ccd2e64e71591b2992bbf322de99525 Mon Sep 17 00:00:00 2001 From: Will Hickey Date: Tue, 11 Oct 2022 12:06:33 -0500 Subject: [PATCH] Consensus Logging (#28176) (#28346) * Consensus Logging (#28176) * dereference bank_slot Co-authored-by: carllin --- core/src/cluster_slot_state_verifier.rs | 43 ++++++++++++++++++++++++- core/src/replay_stage.rs | 16 +++++++-- 2 files changed, 56 insertions(+), 3 deletions(-) diff --git a/core/src/cluster_slot_state_verifier.rs b/core/src/cluster_slot_state_verifier.rs index d1d01a121eebb5..0bd07ae6e46ebb 100644 --- a/core/src/cluster_slot_state_verifier.rs +++ b/core/src/cluster_slot_state_verifier.rs @@ -766,11 +766,33 @@ pub(crate) fn check_slot_agrees_with_cluster( // Needs to happen before the bank_frozen_hash.is_none() check below to account for duplicate // signals arriving before the bank is constructed in replay. - if matches!(slot_state_update, SlotStateUpdate::Duplicate(_)) { + if let SlotStateUpdate::Duplicate(ref state) = slot_state_update { // If this slot has already been processed before, return if !duplicate_slots_tracker.insert(slot) { return; } + + datapoint_info!( + "duplicate_slot", + ("slot", slot, i64), + ( + "duplicate_confirmed_hash", + state + .duplicate_confirmed_hash + .unwrap_or_default() + .to_string(), + String + ), + ( + "my_hash", + state + .bank_status + .bank_hash() + .unwrap_or_default() + .to_string(), + String + ), + ); } // Avoid duplicate work from multiple of the same DuplicateConfirmed signal. This can @@ -781,6 +803,25 @@ pub(crate) fn check_slot_agrees_with_cluster( return; } } + + datapoint_info!( + "duplicate_confirmed_slot", + ("slot", slot, i64), + ( + "duplicate_confirmed_hash", + state.duplicate_confirmed_hash.to_string(), + String + ), + ( + "my_hash", + state + .bank_status + .bank_hash() + .unwrap_or_default() + .to_string(), + String + ), + ); } if let SlotStateUpdate::EpochSlotsFrozen(epoch_slots_frozen_state) = &slot_state_update { diff --git a/core/src/replay_stage.rs b/core/src/replay_stage.rs index 9c0e0e46c72885..4ac9858f735052 100644 --- a/core/src/replay_stage.rs +++ b/core/src/replay_stage.rs @@ -2269,6 +2269,11 @@ impl ReplayStage { transaction_status_sender.send_transaction_status_freeze_message(&bank); } bank.freeze(); + datapoint_info!( + "bank_frozen", + ("slot", *bank_slot, i64), + ("hash", bank.hash().to_string(), String), + ); // report cost tracker stats cost_update_sender .send(CostUpdate::FrozenBank { bank: bank.clone() }) @@ -2631,7 +2636,7 @@ impl ReplayStage { ); match switch_fork_decision { - SwitchForkDecision::FailedSwitchThreshold(_, _) => { + SwitchForkDecision::FailedSwitchThreshold(switch_proof_stake, total_stake) => { let reset_bank = heaviest_bank_on_same_voted_fork; // If we can't switch and our last vote was on a non-duplicate/confirmed slot, then // reset to the the next votable bank on the same fork as our last vote, @@ -2655,9 +2660,16 @@ impl ReplayStage { // then there will be no blocks to include the votes for slot 4, and the network halts // because 90% of validators can't vote info!( - "Waiting to switch vote to {}, resetting to slot {:?} for now", + "Waiting to switch vote to {}, + resetting to slot {:?} for now, + switch proof stake: {}, + threshold stake: {}, + total stake: {}", heaviest_bank.slot(), reset_bank.as_ref().map(|b| b.slot()), + switch_proof_stake, + total_stake as f64 * SWITCH_FORK_THRESHOLD, + total_stake ); failure_reasons.push(HeaviestForkFailures::FailedSwitchThreshold( heaviest_bank.slot(),