Skip to content

Commit

Permalink
[TESTING] - Fix Startup / Restart Test Failures (#3669)
Browse files Browse the repository at this point in the history
* add logic to remove from failed views if node decides on view after timeout

* test ci for new failure

* subtract properly

* cleanup

* spin up node a few views earlier

* increase round start delay
  • Loading branch information
lukeiannucci authored Sep 12, 2024
1 parent 45f70a4 commit 7879bf0
Show file tree
Hide file tree
Showing 2 changed files with 71 additions and 8 deletions.
76 changes: 68 additions & 8 deletions crates/testing/src/overall_safety_task.rs
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,12 @@ pub enum OverallSafetyTaskErr<TYPES: NodeType> {
expected_failed_views: HashSet<TYPES::Time>,
actual_failed_views: HashSet<TYPES::Time>,
},
/// This is a case where we have too many failed + succesful views over round results
/// This should never be the case and requires debugging if we see this get thrown
NotEnoughRoundResults {
results_count: usize,
views_count: usize,
},
}

/// Data availability task state
Expand Down Expand Up @@ -165,8 +171,19 @@ impl<TYPES: NodeType, I: TestableNodeImplementation<TYPES>, V: Versions> TestTas
let paired_up = (leaf_chain.to_vec(), (*qc).clone());
match self.ctx.round_results.entry(view_number) {
Entry::Occupied(mut o) => {
o.get_mut()
.insert_into_result(id, paired_up, maybe_block_size)
let entry = o.get_mut();
let leaf = entry.insert_into_result(id, paired_up, maybe_block_size);

// Here we noticed is a node may start up and time out waiting for a proposal
// So we add the timeout to failed_views, but eventually the proposal is received we decide on the view
// If we do indeed have a view timeout for the node at this point we want to remove it
entry.cleanup_previous_timeouts_on_view(
&mut self.ctx.failed_views,
&view_number,
&(id as u64),
);

leaf
}
Entry::Vacant(v) => {
let mut round_result = RoundResult::default();
Expand Down Expand Up @@ -249,12 +266,21 @@ impl<TYPES: NodeType, I: TestableNodeImplementation<TYPES>, V: Versions> TestTas
expected_views_to_fail,
}: OverallSafetyPropertiesDescription<TYPES> = self.properties.clone();

let num_incomplete_views = self
.ctx
.round_results
.len()
.saturating_sub(self.ctx.successful_views.len())
.saturating_sub(self.ctx.failed_views.len());
let views_count = self.ctx.failed_views.len() + self.ctx.successful_views.len();
let results_count = self.ctx.round_results.len();

// This can cause tests to crash if we do the subtracting to get `num_incomplete_views` below
// So lets fail return an error instead
// Use this check instead of saturating_sub as that could hide a real problem
if views_count > results_count {
return TestResult::Fail(Box::new(
OverallSafetyTaskErr::<TYPES>::NotEnoughRoundResults {
results_count,
views_count,
},
));
}
let num_incomplete_views = results_count - views_count;

if self.ctx.successful_views.len() < num_successful_views {
return TestResult::Fail(Box::new(OverallSafetyTaskErr::<TYPES>::NotEnoughDecides {
Expand Down Expand Up @@ -540,6 +566,40 @@ impl<TYPES: NodeType> RoundResult<TYPES> {
}
leaves
}

fn cleanup_previous_timeouts_on_view(
&mut self,
failed_views: &mut HashSet<TYPES::Time>,
view_number: &TYPES::Time,
id: &u64,
) {
// check if this node had a previous timeout
match self.failed_nodes.get(id) {
Some(error) => match error.as_ref() {
HotShotError::ViewTimeoutError {
view_number,
state: _,
} => {
tracing::debug!(
"Node {} originally timeout for view: {:?}. It has now been decided on.",
id,
view_number
);
self.failed_nodes.remove(id);
}
_ => return,
},
None => return,
}

// check if no more failed nodes
if self.failed_nodes.is_empty() && failed_views.remove(view_number) {
tracing::debug!(
"Removed view {:?} from failed views, all nodes have agreed upon view.",
view_number
);
}
}
}

/// cross node safety properties
Expand Down
3 changes: 3 additions & 0 deletions crates/testing/tests/tests_2/catchup.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,9 @@ async fn test_catchup() {
async_compatibility_layer::logging::setup_backtrace();
let timing_data = TimingData {
next_view_timeout: 2000,
// increase the round delay for this test
// TODO: remove this delay increase for test - https://github.com/EspressoSystems/HotShot/issues/3673
round_start_delay: 200,
..Default::default()
};
let mut metadata: TestDescription<TestTypes, MemoryImpl, TestVersions> =
Expand Down

0 comments on commit 7879bf0

Please sign in to comment.