From 7879bf000e1292f4e1f4cc35644564e53e301bcd Mon Sep 17 00:00:00 2001 From: lukeiannucci Date: Thu, 12 Sep 2024 12:29:33 -0400 Subject: [PATCH] [TESTING] - Fix Startup / Restart Test Failures (#3669) * add logic to remove from failed views if node decides on view after timeout * test ci for new failure * subtract properly * cleanup * spin up node a few views earlier * increase round start delay --- crates/testing/src/overall_safety_task.rs | 76 ++++++++++++++++++++--- crates/testing/tests/tests_2/catchup.rs | 3 + 2 files changed, 71 insertions(+), 8 deletions(-) diff --git a/crates/testing/src/overall_safety_task.rs b/crates/testing/src/overall_safety_task.rs index 5aac9086d3..2a7066b07c 100644 --- a/crates/testing/src/overall_safety_task.rs +++ b/crates/testing/src/overall_safety_task.rs @@ -82,6 +82,12 @@ pub enum OverallSafetyTaskErr { expected_failed_views: HashSet, actual_failed_views: HashSet, }, + /// This is a case where we have too many failed + succesful views over round results + /// This should never be the case and requires debugging if we see this get thrown + NotEnoughRoundResults { + results_count: usize, + views_count: usize, + }, } /// Data availability task state @@ -165,8 +171,19 @@ impl, V: Versions> TestTas let paired_up = (leaf_chain.to_vec(), (*qc).clone()); match self.ctx.round_results.entry(view_number) { Entry::Occupied(mut o) => { - o.get_mut() - .insert_into_result(id, paired_up, maybe_block_size) + let entry = o.get_mut(); + let leaf = entry.insert_into_result(id, paired_up, maybe_block_size); + + // Here we noticed is a node may start up and time out waiting for a proposal + // So we add the timeout to failed_views, but eventually the proposal is received we decide on the view + // If we do indeed have a view timeout for the node at this point we want to remove it + entry.cleanup_previous_timeouts_on_view( + &mut self.ctx.failed_views, + &view_number, + &(id as u64), + ); + + leaf } Entry::Vacant(v) => { let mut round_result = RoundResult::default(); @@ -249,12 +266,21 @@ impl, V: Versions> TestTas expected_views_to_fail, }: OverallSafetyPropertiesDescription = self.properties.clone(); - let num_incomplete_views = self - .ctx - .round_results - .len() - .saturating_sub(self.ctx.successful_views.len()) - .saturating_sub(self.ctx.failed_views.len()); + let views_count = self.ctx.failed_views.len() + self.ctx.successful_views.len(); + let results_count = self.ctx.round_results.len(); + + // This can cause tests to crash if we do the subtracting to get `num_incomplete_views` below + // So lets fail return an error instead + // Use this check instead of saturating_sub as that could hide a real problem + if views_count > results_count { + return TestResult::Fail(Box::new( + OverallSafetyTaskErr::::NotEnoughRoundResults { + results_count, + views_count, + }, + )); + } + let num_incomplete_views = results_count - views_count; if self.ctx.successful_views.len() < num_successful_views { return TestResult::Fail(Box::new(OverallSafetyTaskErr::::NotEnoughDecides { @@ -540,6 +566,40 @@ impl RoundResult { } leaves } + + fn cleanup_previous_timeouts_on_view( + &mut self, + failed_views: &mut HashSet, + view_number: &TYPES::Time, + id: &u64, + ) { + // check if this node had a previous timeout + match self.failed_nodes.get(id) { + Some(error) => match error.as_ref() { + HotShotError::ViewTimeoutError { + view_number, + state: _, + } => { + tracing::debug!( + "Node {} originally timeout for view: {:?}. It has now been decided on.", + id, + view_number + ); + self.failed_nodes.remove(id); + } + _ => return, + }, + None => return, + } + + // check if no more failed nodes + if self.failed_nodes.is_empty() && failed_views.remove(view_number) { + tracing::debug!( + "Removed view {:?} from failed views, all nodes have agreed upon view.", + view_number + ); + } + } } /// cross node safety properties diff --git a/crates/testing/tests/tests_2/catchup.rs b/crates/testing/tests/tests_2/catchup.rs index 16d7e530ea..8253ae8084 100644 --- a/crates/testing/tests/tests_2/catchup.rs +++ b/crates/testing/tests/tests_2/catchup.rs @@ -23,6 +23,9 @@ async fn test_catchup() { async_compatibility_layer::logging::setup_backtrace(); let timing_data = TimingData { next_view_timeout: 2000, + // increase the round delay for this test + // TODO: remove this delay increase for test - https://github.com/EspressoSystems/HotShot/issues/3673 + round_start_delay: 200, ..Default::default() }; let mut metadata: TestDescription =