From 17b64005d3d77bdbf2c5f7f2002ce6351d8c59e3 Mon Sep 17 00:00:00 2001 From: Ashwin Sekar Date: Tue, 27 Dec 2022 09:47:45 -0800 Subject: [PATCH] Add more logging and documentation to flaky optimistic confirmation tests (#29418) * Revert "add retry for flakey local cluster test (#29228)" This reverts commit 7a97121747946bf14dbb97c855a064720520bf15. * Add logging for repair --- core/src/serve_repair.rs | 5 ++ local-cluster/tests/local_cluster_flakey.rs | 62 ++++++--------------- 2 files changed, 21 insertions(+), 46 deletions(-) diff --git a/core/src/serve_repair.rs b/core/src/serve_repair.rs index 998b1da253a0a0..65c60f62d741be 100644 --- a/core/src/serve_repair.rs +++ b/core/src/serve_repair.rs @@ -926,6 +926,11 @@ impl ServeRepair { nonce, identity_keypair, )?; + debug!( + "Sending repair request from {} for {:#?}", + identity_keypair.pubkey(), + repair_request + ); Ok((addr, out)) } diff --git a/local-cluster/tests/local_cluster_flakey.rs b/local-cluster/tests/local_cluster_flakey.rs index 4f3a98463a8fed..20a0272a71ce93 100644 --- a/local-cluster/tests/local_cluster_flakey.rs +++ b/local-cluster/tests/local_cluster_flakey.rs @@ -38,43 +38,18 @@ fn test_optimistic_confirmation_violation_without_tower() { do_test_optimistic_confirmation_violation_with_or_without_tower(false); } -enum RunResult { - Success, - FailNoViolation, - FailViolation, -} - -fn do_test_optimistic_confirmation_violation_with_or_without_tower(with_tower: bool) { - let mut retry = 10; - while retry > 0 { - match do_test_optimistic_confirmation_violation_with_or_without_tower_inner(with_tower) { - RunResult::Success => { - return; - } - _ => { - retry -= 1; - } - } - } - panic!("optimistic confirmation violation with or without tower failed after 10 trial"); -} - // A bit convoluted test case; but this roughly follows this test theoretical scenario: +// Validator A, B, C have 31, 36, 33 % of stake respectively. Leader schedule is split, first half +// of the test B is always leader, second half C is. Additionally we have a non voting validator D with 0 +// stake to propagate gossip info. // -// Step 1: You have validator A + B with 31% and 36% of the stake. Run only validator B: -// -// S0 -> S1 -> S2 -> S3 (B vote) -// -// Step 2: Turn off B, and truncate the ledger after slot `S3` (simulate votes not -// landing in next slot). -// Copy ledger fully to validator A and validator C -// -// Step 3: Turn on A, and have it vote up to S3. Truncate anything past slot `S3`. +// Step 1: Kill C, only A, B and D should be running // // S0 -> S1 -> S2 -> S3 (A & B vote, optimistically confirmed) // -// Step 4: -// Start validator C with 33% of the stake with same ledger, but only up to slot S2. +// Step 2: +// Kill A and B once we verify that they have voted on S3 or beyond. Copy B's ledger to C but only +// up to slot S2 // Have `C` generate some blocks like: // // S0 -> S1 -> S2 -> S4 @@ -99,10 +74,8 @@ fn do_test_optimistic_confirmation_violation_with_or_without_tower(with_tower: b // With the persisted tower: // `A` should not be able to generate a switching proof. // -fn do_test_optimistic_confirmation_violation_with_or_without_tower_inner( - with_tower: bool, -) -> RunResult { - solana_logger::setup_with_default(RUST_LOG_FILTER); +fn do_test_optimistic_confirmation_violation_with_or_without_tower(with_tower: bool) { + solana_logger::setup_with("debug"); // First set up the cluster with 4 nodes let slots_per_epoch = 2048; @@ -150,7 +123,7 @@ fn do_test_optimistic_confirmation_violation_with_or_without_tower_inner( // 2. Validator A doesn't vote past `next_slot_on_a` before we can kill it. This is essential // because if validator A votes past `next_slot_on_a`, and then we copy over validator B's ledger // below only for slots <= `next_slot_on_a`, validator A will not know how it's last vote chains - // to the otehr forks, and may violate switching proofs on restart. + // to the other forks, and may violate switching proofs on restart. let mut default_config = ValidatorConfig::default_for_test(); // Split leader schedule 50-50 between validators B and C, don't give validator A any slots because // it's going to be deleting its ledger, so may create versions of slots it's already created, but @@ -172,8 +145,9 @@ fn do_test_optimistic_confirmation_violation_with_or_without_tower_inner( let mut validator_configs = make_identical_validator_configs(&default_config, node_stakes.len()); - // Disable voting on validator C + // Disable voting on validators C, and D validator_configs[2].voting_disabled = true; + validator_configs[3].voting_disabled = true; let mut config = ClusterConfig { cluster_lamports: DEFAULT_CLUSTER_LAMPORTS + node_stakes.iter().sum::(), @@ -254,7 +228,7 @@ fn do_test_optimistic_confirmation_violation_with_or_without_tower_inner( let validator_a_info = cluster.exit_node(&validator_a_pubkey); // Step 2: - // Stop validator and truncate ledger, copy over B's ledger to C + // Truncate ledger, copy over B's ledger to C info!("Create validator C's ledger"); { // first copy from validator B's ledger @@ -290,10 +264,10 @@ fn do_test_optimistic_confirmation_violation_with_or_without_tower_inner( .unwrap() .0; assert!(last_vote_slot >= next_slot_on_a); + info!("Success, A voted on slot {}", last_vote_slot); { let blockstore = open_blockstore(&val_a_ledger_path); - purge_slots_with_count(&blockstore, next_slot_on_a + 1, truncated_slots); if !with_tower { info!("Removing tower!"); remove_tower(&val_a_ledger_path, &validator_a_pubkey); @@ -369,17 +343,13 @@ fn do_test_optimistic_confirmation_violation_with_or_without_tower_inner( let expects_optimistic_confirmation_violation = !with_tower; if bad_vote_detected != expects_optimistic_confirmation_violation { if bad_vote_detected { - error!("No violation expected because of persisted tower!"); - return RunResult::FailNoViolation; + panic!("No violation expected because of persisted tower!"); } else { - error!("Violation expected because of removed persisted tower!"); - return RunResult::FailViolation; + panic!("Violation expected because of removed persisted tower!"); } } else if bad_vote_detected { info!("THIS TEST expected violations. And indeed, there was some, because of removed persisted tower."); } else { info!("THIS TEST expected no violation. And indeed, there was none, thanks to persisted tower."); } - - RunResult::Success }