From d390fde92b09ec84ae7cf690754679b778f3eebc Mon Sep 17 00:00:00 2001
From: Luke Iannucci <luke.iannucci@outlook.com>
Date: Wed, 11 Sep 2024 15:32:19 -0400
Subject: [PATCH 1/6] add logic to remove from failed views if node decides on
 view after timeout

---
 crates/testing/src/overall_safety_task.rs | 58 +++++++++++++++++++----
 1 file changed, 50 insertions(+), 8 deletions(-)
diff --git a/crates/testing/src/overall_safety_task.rs b/crates/testing/src/overall_safety_task.rs
index 5aac9086d3..f63415a963 100644
--- a/crates/testing/src/overall_safety_task.rs
+++ b/crates/testing/src/overall_safety_task.rs
@@ -165,8 +165,19 @@ impl<TYPES: NodeType, I: TestableNodeImplementation<TYPES>, V: Versions> TestTas
                 let paired_up = (leaf_chain.to_vec(), (*qc).clone());
                 match self.ctx.round_results.entry(view_number) {
                     Entry::Occupied(mut o) => {
-                        o.get_mut()
-                            .insert_into_result(id, paired_up, maybe_block_size)
+                        let entry = o.get_mut();
+                        let leaf = entry.insert_into_result(id, paired_up, maybe_block_size);
+
+                        // Here we noticed is a node may start up and time out waiting for a proposal
+                        // So we add the timeout to failed_views, but eventually the proposal is received we decide on the view
+                        // If we do indeed have a view timeout for the node at this point we want to remove it
+                        entry.cleanup_previous_timeouts_on_view(
+                            &mut self.ctx.failed_views,
+                            &view_number,
+                            &(id as u64),
+                        );
+
+                        leaf
                     }
                     Entry::Vacant(v) => {
                         let mut round_result = RoundResult::default();
@@ -249,12 +260,9 @@ impl<TYPES: NodeType, I: TestableNodeImplementation<TYPES>, V: Versions> TestTas
             expected_views_to_fail,
         }: OverallSafetyPropertiesDescription<TYPES> = self.properties.clone();
 
-        let num_incomplete_views = self
-            .ctx
-            .round_results
-            .len()
-            .saturating_sub(self.ctx.successful_views.len())
-            .saturating_sub(self.ctx.failed_views.len());
+        let num_incomplete_views = self.ctx.round_results.len()
+            - self.ctx.successful_views.len()
+            - self.ctx.failed_views.len();
 
         if self.ctx.successful_views.len() < num_successful_views {
             return TestResult::Fail(Box::new(OverallSafetyTaskErr::<TYPES>::NotEnoughDecides {
@@ -540,6 +548,40 @@ impl<TYPES: NodeType> RoundResult<TYPES> {
         }
         leaves
     }
+
+    fn cleanup_previous_timeouts_on_view(
+        &mut self,
+        failed_views: &mut HashSet<TYPES::Time>,
+        view_number: &TYPES::Time,
+        id: &u64,
+    ) {
+        // check if this node had a previous timeout
+        match self.failed_nodes.get(id) {
+            Some(error) => match error.as_ref() {
+                HotShotError::ViewTimeoutError {
+                    view_number,
+                    state: _,
+                } => {
+                    tracing::debug!(
+                        "Node {} originally timeout for view: {:?}. It has now been decided on.",
+                        id,
+                        view_number
+                    );
+                    self.failed_nodes.remove(id);
+                }
+                _ => return,
+            },
+            None => return,
+        }
+
+        // check if no more failed nodes
+        if self.failed_nodes.is_empty() && failed_views.remove(view_number) {
+            tracing::debug!(
+                "Removed view {:?} from failed views, all nodes have agreed upon view.",
+                view_number
+            );
+        }
+    }
 }
 
 /// cross node safety properties

From be0837505df6fabf4998d8e81f8e680213706f7b Mon Sep 17 00:00:00 2001
From: Luke Iannucci <luke.iannucci@outlook.com>
Date: Thu, 12 Sep 2024 09:48:20 -0400
Subject: [PATCH 2/6] test ci for new failure

---
 crates/testing/src/overall_safety_task.rs | 25 +++++++++++++++++++----
 1 file changed, 21 insertions(+), 4 deletions(-)

diff --git a/crates/testing/src/overall_safety_task.rs b/crates/testing/src/overall_safety_task.rs
index f63415a963..cb4e5c4536 100644
--- a/crates/testing/src/overall_safety_task.rs
+++ b/crates/testing/src/overall_safety_task.rs
@@ -82,6 +82,12 @@ pub enum OverallSafetyTaskErr<TYPES: NodeType> {
         expected_failed_views: HashSet<TYPES::Time>,
         actual_failed_views: HashSet<TYPES::Time>,
     },
+    /// This is a case where we have too many failured + succesful views over round results
+    /// This should never be the case and requires debugging if we see this get thrown
+    NotEnoughRoundResults {
+        results_count: usize,
+        views_count: usize,
+    },
 }
 
 /// Data availability task state
@@ -260,9 +266,20 @@ impl<TYPES: NodeType, I: TestableNodeImplementation<TYPES>, V: Versions> TestTas
             expected_views_to_fail,
         }: OverallSafetyPropertiesDescription<TYPES> = self.properties.clone();
 
-        let num_incomplete_views = self.ctx.round_results.len()
-            - self.ctx.successful_views.len()
-            - self.ctx.failed_views.len();
+        let views_count = self.ctx.failed_views.len() + self.ctx.successful_views.len();
+        let results_count = self.ctx.round_results.len();
+
+        // This can cause tests to crash if we do the subtracting to get `num_incomplete_views` below, so throw an error instead
+        // Use this instead of saturating_sub as that could hide a real problem
+        if views_count > results_count {
+            return TestResult::Fail(Box::new(
+                OverallSafetyTaskErr::<TYPES>::NotEnoughRoundResults {
+                    results_count: results_count,
+                    views_count: views_count,
+                },
+            ));
+        }
+        let num_incomplete_views = views_count - results_count;
 
         if self.ctx.successful_views.len() < num_successful_views {
             return TestResult::Fail(Box::new(OverallSafetyTaskErr::<TYPES>::NotEnoughDecides {
@@ -575,7 +592,7 @@ impl<TYPES: NodeType> RoundResult<TYPES> {
         }
 
         // check if no more failed nodes
-        if self.failed_nodes.is_empty() && failed_views.remove(view_number) {
+        if self.failed_nodes.is_empty() && failed_views.contains(view_number) {
             tracing::debug!(
                 "Removed view {:?} from failed views, all nodes have agreed upon view.",
                 view_number

From 5f731dcf2323436853e7cd0d6125ada9a5eb0dcc Mon Sep 17 00:00:00 2001
From: Luke Iannucci <luke.iannucci@outlook.com>
Date: Thu, 12 Sep 2024 10:05:19 -0400
Subject: [PATCH 3/6] subtract properly

---
 crates/testing/src/overall_safety_task.rs | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/crates/testing/src/overall_safety_task.rs b/crates/testing/src/overall_safety_task.rs
index cb4e5c4536..82f42b034b 100644
--- a/crates/testing/src/overall_safety_task.rs
+++ b/crates/testing/src/overall_safety_task.rs
@@ -82,7 +82,7 @@ pub enum OverallSafetyTaskErr<TYPES: NodeType> {
         expected_failed_views: HashSet<TYPES::Time>,
         actual_failed_views: HashSet<TYPES::Time>,
     },
-    /// This is a case where we have too many failured + succesful views over round results
+    /// This is a case where we have too many failed + succesful views over round results
     /// This should never be the case and requires debugging if we see this get thrown
     NotEnoughRoundResults {
         results_count: usize,
@@ -274,12 +274,12 @@ impl<TYPES: NodeType, I: TestableNodeImplementation<TYPES>, V: Versions> TestTas
         if views_count > results_count {
             return TestResult::Fail(Box::new(
                 OverallSafetyTaskErr::<TYPES>::NotEnoughRoundResults {
-                    results_count: results_count,
-                    views_count: views_count,
+                    results_count,
+                    views_count,
                 },
             ));
         }
-        let num_incomplete_views = views_count - results_count;
+        let num_incomplete_views = results_count - views_count;
 
         if self.ctx.successful_views.len() < num_successful_views {
             return TestResult::Fail(Box::new(OverallSafetyTaskErr::<TYPES>::NotEnoughDecides {

From ac5a93a775dd477eb795878d5dd7403ffb03d5cc Mon Sep 17 00:00:00 2001
From: Luke Iannucci <luke.iannucci@outlook.com>
Date: Thu, 12 Sep 2024 10:27:39 -0400
Subject: [PATCH 4/6] cleanup

---
 crates/testing/src/overall_safety_task.rs | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/crates/testing/src/overall_safety_task.rs b/crates/testing/src/overall_safety_task.rs
index 82f42b034b..2a7066b07c 100644
--- a/crates/testing/src/overall_safety_task.rs
+++ b/crates/testing/src/overall_safety_task.rs
@@ -269,8 +269,9 @@ impl<TYPES: NodeType, I: TestableNodeImplementation<TYPES>, V: Versions> TestTas
         let views_count = self.ctx.failed_views.len() + self.ctx.successful_views.len();
         let results_count = self.ctx.round_results.len();
 
-        // This can cause tests to crash if we do the subtracting to get `num_incomplete_views` below, so throw an error instead
-        // Use this instead of saturating_sub as that could hide a real problem
+        // This can cause tests to crash if we do the subtracting to get `num_incomplete_views` below
+        // So lets fail return an error instead
+        // Use this check instead of saturating_sub as that could hide a real problem
         if views_count > results_count {
             return TestResult::Fail(Box::new(
                 OverallSafetyTaskErr::<TYPES>::NotEnoughRoundResults {
@@ -592,7 +593,7 @@ impl<TYPES: NodeType> RoundResult<TYPES> {
         }
 
         // check if no more failed nodes
-        if self.failed_nodes.is_empty() && failed_views.contains(view_number) {
+        if self.failed_nodes.is_empty() && failed_views.remove(view_number) {
             tracing::debug!(
                 "Removed view {:?} from failed views, all nodes have agreed upon view.",
                 view_number

From 8abaa2cccce246ba61eb0ec4686b3055cbb7083e Mon Sep 17 00:00:00 2001
From: Luke Iannucci <luke.iannucci@outlook.com>
Date: Thu, 12 Sep 2024 11:02:23 -0400
Subject: [PATCH 5/6] spin up node a few views earlier

---
 crates/testing/tests/tests_2/catchup.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/crates/testing/tests/tests_2/catchup.rs b/crates/testing/tests/tests_2/catchup.rs
index 16d7e530ea..125812c710 100644
--- a/crates/testing/tests/tests_2/catchup.rs
+++ b/crates/testing/tests/tests_2/catchup.rs
@@ -41,7 +41,7 @@ async fn test_catchup() {
 
     metadata.spinning_properties = SpinningTaskDescription {
         // Start the nodes before their leadership.
-        node_changes: vec![(13, catchup_node)],
+        node_changes: vec![(8, catchup_node)],
     };
 
     metadata.completion_task_description =

From cb2c538eef5536a91766488185c3d92cb50e5a51 Mon Sep 17 00:00:00 2001
From: Luke Iannucci <luke.iannucci@outlook.com>
Date: Thu, 12 Sep 2024 12:10:19 -0400
Subject: [PATCH 6/6] increase round start delay

---
 crates/testing/tests/tests_2/catchup.rs | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/crates/testing/tests/tests_2/catchup.rs b/crates/testing/tests/tests_2/catchup.rs
index 125812c710..8253ae8084 100644
--- a/crates/testing/tests/tests_2/catchup.rs
+++ b/crates/testing/tests/tests_2/catchup.rs
@@ -23,6 +23,9 @@ async fn test_catchup() {
     async_compatibility_layer::logging::setup_backtrace();
     let timing_data = TimingData {
         next_view_timeout: 2000,
+        // increase the round delay for this test
+        // TODO: remove this delay increase for test - https://github.com/EspressoSystems/HotShot/issues/3673
+        round_start_delay: 200,
         ..Default::default()
     };
     let mut metadata: TestDescription<TestTypes, MemoryImpl, TestVersions> =
@@ -41,7 +44,7 @@ async fn test_catchup() {
 
     metadata.spinning_properties = SpinningTaskDescription {
         // Start the nodes before their leadership.
-        node_changes: vec![(8, catchup_node)],
+        node_changes: vec![(13, catchup_node)],
     };
 
     metadata.completion_task_description =