From d390fde92b09ec84ae7cf690754679b778f3eebc Mon Sep 17 00:00:00 2001 From: Luke Iannucci Date: Wed, 11 Sep 2024 15:32:19 -0400 Subject: [PATCH 1/6] add logic to remove from failed views if node decides on view after timeout --- crates/testing/src/overall_safety_task.rs | 58 +++++++++++++++++++---- 1 file changed, 50 insertions(+), 8 deletions(-) diff --git a/crates/testing/src/overall_safety_task.rs b/crates/testing/src/overall_safety_task.rs index 5aac9086d3..f63415a963 100644 --- a/crates/testing/src/overall_safety_task.rs +++ b/crates/testing/src/overall_safety_task.rs @@ -165,8 +165,19 @@ impl, V: Versions> TestTas let paired_up = (leaf_chain.to_vec(), (*qc).clone()); match self.ctx.round_results.entry(view_number) { Entry::Occupied(mut o) => { - o.get_mut() - .insert_into_result(id, paired_up, maybe_block_size) + let entry = o.get_mut(); + let leaf = entry.insert_into_result(id, paired_up, maybe_block_size); + + // Here we noticed is a node may start up and time out waiting for a proposal + // So we add the timeout to failed_views, but eventually the proposal is received we decide on the view + // If we do indeed have a view timeout for the node at this point we want to remove it + entry.cleanup_previous_timeouts_on_view( + &mut self.ctx.failed_views, + &view_number, + &(id as u64), + ); + + leaf } Entry::Vacant(v) => { let mut round_result = RoundResult::default(); @@ -249,12 +260,9 @@ impl, V: Versions> TestTas expected_views_to_fail, }: OverallSafetyPropertiesDescription = self.properties.clone(); - let num_incomplete_views = self - .ctx - .round_results - .len() - .saturating_sub(self.ctx.successful_views.len()) - .saturating_sub(self.ctx.failed_views.len()); + let num_incomplete_views = self.ctx.round_results.len() + - self.ctx.successful_views.len() + - self.ctx.failed_views.len(); if self.ctx.successful_views.len() < num_successful_views { return TestResult::Fail(Box::new(OverallSafetyTaskErr::::NotEnoughDecides { @@ -540,6 +548,40 @@ impl RoundResult { } leaves } + + fn cleanup_previous_timeouts_on_view( + &mut self, + failed_views: &mut HashSet, + view_number: &TYPES::Time, + id: &u64, + ) { + // check if this node had a previous timeout + match self.failed_nodes.get(id) { + Some(error) => match error.as_ref() { + HotShotError::ViewTimeoutError { + view_number, + state: _, + } => { + tracing::debug!( + "Node {} originally timeout for view: {:?}. It has now been decided on.", + id, + view_number + ); + self.failed_nodes.remove(id); + } + _ => return, + }, + None => return, + } + + // check if no more failed nodes + if self.failed_nodes.is_empty() && failed_views.remove(view_number) { + tracing::debug!( + "Removed view {:?} from failed views, all nodes have agreed upon view.", + view_number + ); + } + } } /// cross node safety properties From be0837505df6fabf4998d8e81f8e680213706f7b Mon Sep 17 00:00:00 2001 From: Luke Iannucci Date: Thu, 12 Sep 2024 09:48:20 -0400 Subject: [PATCH 2/6] test ci for new failure --- crates/testing/src/overall_safety_task.rs | 25 +++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/crates/testing/src/overall_safety_task.rs b/crates/testing/src/overall_safety_task.rs index f63415a963..cb4e5c4536 100644 --- a/crates/testing/src/overall_safety_task.rs +++ b/crates/testing/src/overall_safety_task.rs @@ -82,6 +82,12 @@ pub enum OverallSafetyTaskErr { expected_failed_views: HashSet, actual_failed_views: HashSet, }, + /// This is a case where we have too many failured + succesful views over round results + /// This should never be the case and requires debugging if we see this get thrown + NotEnoughRoundResults { + results_count: usize, + views_count: usize, + }, } /// Data availability task state @@ -260,9 +266,20 @@ impl, V: Versions> TestTas expected_views_to_fail, }: OverallSafetyPropertiesDescription = self.properties.clone(); - let num_incomplete_views = self.ctx.round_results.len() - - self.ctx.successful_views.len() - - self.ctx.failed_views.len(); + let views_count = self.ctx.failed_views.len() + self.ctx.successful_views.len(); + let results_count = self.ctx.round_results.len(); + + // This can cause tests to crash if we do the subtracting to get `num_incomplete_views` below, so throw an error instead + // Use this instead of saturating_sub as that could hide a real problem + if views_count > results_count { + return TestResult::Fail(Box::new( + OverallSafetyTaskErr::::NotEnoughRoundResults { + results_count: results_count, + views_count: views_count, + }, + )); + } + let num_incomplete_views = views_count - results_count; if self.ctx.successful_views.len() < num_successful_views { return TestResult::Fail(Box::new(OverallSafetyTaskErr::::NotEnoughDecides { @@ -575,7 +592,7 @@ impl RoundResult { } // check if no more failed nodes - if self.failed_nodes.is_empty() && failed_views.remove(view_number) { + if self.failed_nodes.is_empty() && failed_views.contains(view_number) { tracing::debug!( "Removed view {:?} from failed views, all nodes have agreed upon view.", view_number From 5f731dcf2323436853e7cd0d6125ada9a5eb0dcc Mon Sep 17 00:00:00 2001 From: Luke Iannucci Date: Thu, 12 Sep 2024 10:05:19 -0400 Subject: [PATCH 3/6] subtract properly --- crates/testing/src/overall_safety_task.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/crates/testing/src/overall_safety_task.rs b/crates/testing/src/overall_safety_task.rs index cb4e5c4536..82f42b034b 100644 --- a/crates/testing/src/overall_safety_task.rs +++ b/crates/testing/src/overall_safety_task.rs @@ -82,7 +82,7 @@ pub enum OverallSafetyTaskErr { expected_failed_views: HashSet, actual_failed_views: HashSet, }, - /// This is a case where we have too many failured + succesful views over round results + /// This is a case where we have too many failed + succesful views over round results /// This should never be the case and requires debugging if we see this get thrown NotEnoughRoundResults { results_count: usize, @@ -274,12 +274,12 @@ impl, V: Versions> TestTas if views_count > results_count { return TestResult::Fail(Box::new( OverallSafetyTaskErr::::NotEnoughRoundResults { - results_count: results_count, - views_count: views_count, + results_count, + views_count, }, )); } - let num_incomplete_views = views_count - results_count; + let num_incomplete_views = results_count - views_count; if self.ctx.successful_views.len() < num_successful_views { return TestResult::Fail(Box::new(OverallSafetyTaskErr::::NotEnoughDecides { From ac5a93a775dd477eb795878d5dd7403ffb03d5cc Mon Sep 17 00:00:00 2001 From: Luke Iannucci Date: Thu, 12 Sep 2024 10:27:39 -0400 Subject: [PATCH 4/6] cleanup --- crates/testing/src/overall_safety_task.rs | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/crates/testing/src/overall_safety_task.rs b/crates/testing/src/overall_safety_task.rs index 82f42b034b..2a7066b07c 100644 --- a/crates/testing/src/overall_safety_task.rs +++ b/crates/testing/src/overall_safety_task.rs @@ -269,8 +269,9 @@ impl, V: Versions> TestTas let views_count = self.ctx.failed_views.len() + self.ctx.successful_views.len(); let results_count = self.ctx.round_results.len(); - // This can cause tests to crash if we do the subtracting to get `num_incomplete_views` below, so throw an error instead - // Use this instead of saturating_sub as that could hide a real problem + // This can cause tests to crash if we do the subtracting to get `num_incomplete_views` below + // So lets fail return an error instead + // Use this check instead of saturating_sub as that could hide a real problem if views_count > results_count { return TestResult::Fail(Box::new( OverallSafetyTaskErr::::NotEnoughRoundResults { @@ -592,7 +593,7 @@ impl RoundResult { } // check if no more failed nodes - if self.failed_nodes.is_empty() && failed_views.contains(view_number) { + if self.failed_nodes.is_empty() && failed_views.remove(view_number) { tracing::debug!( "Removed view {:?} from failed views, all nodes have agreed upon view.", view_number From 8abaa2cccce246ba61eb0ec4686b3055cbb7083e Mon Sep 17 00:00:00 2001 From: Luke Iannucci Date: Thu, 12 Sep 2024 11:02:23 -0400 Subject: [PATCH 5/6] spin up node a few views earlier --- crates/testing/tests/tests_2/catchup.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/testing/tests/tests_2/catchup.rs b/crates/testing/tests/tests_2/catchup.rs index 16d7e530ea..125812c710 100644 --- a/crates/testing/tests/tests_2/catchup.rs +++ b/crates/testing/tests/tests_2/catchup.rs @@ -41,7 +41,7 @@ async fn test_catchup() { metadata.spinning_properties = SpinningTaskDescription { // Start the nodes before their leadership. - node_changes: vec![(13, catchup_node)], + node_changes: vec![(8, catchup_node)], }; metadata.completion_task_description = From cb2c538eef5536a91766488185c3d92cb50e5a51 Mon Sep 17 00:00:00 2001 From: Luke Iannucci Date: Thu, 12 Sep 2024 12:10:19 -0400 Subject: [PATCH 6/6] increase round start delay --- crates/testing/tests/tests_2/catchup.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/crates/testing/tests/tests_2/catchup.rs b/crates/testing/tests/tests_2/catchup.rs index 125812c710..8253ae8084 100644 --- a/crates/testing/tests/tests_2/catchup.rs +++ b/crates/testing/tests/tests_2/catchup.rs @@ -23,6 +23,9 @@ async fn test_catchup() { async_compatibility_layer::logging::setup_backtrace(); let timing_data = TimingData { next_view_timeout: 2000, + // increase the round delay for this test + // TODO: remove this delay increase for test - https://github.com/EspressoSystems/HotShot/issues/3673 + round_start_delay: 200, ..Default::default() }; let mut metadata: TestDescription = @@ -41,7 +44,7 @@ async fn test_catchup() { metadata.spinning_properties = SpinningTaskDescription { // Start the nodes before their leadership. - node_changes: vec![(8, catchup_node)], + node_changes: vec![(13, catchup_node)], }; metadata.completion_task_description =