Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[TESTING] - Fix Startup / Restart Test Failures #3669

Merged
merged 6 commits into from
Sep 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
76 changes: 68 additions & 8 deletions crates/testing/src/overall_safety_task.rs
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,12 @@ pub enum OverallSafetyTaskErr<TYPES: NodeType> {
expected_failed_views: HashSet<TYPES::Time>,
actual_failed_views: HashSet<TYPES::Time>,
},
/// This is a case where we have too many failed + succesful views over round results
/// This should never be the case and requires debugging if we see this get thrown
NotEnoughRoundResults {
results_count: usize,
views_count: usize,
},
}

/// Data availability task state
Expand Down Expand Up @@ -165,8 +171,19 @@ impl<TYPES: NodeType, I: TestableNodeImplementation<TYPES>, V: Versions> TestTas
let paired_up = (leaf_chain.to_vec(), (*qc).clone());
match self.ctx.round_results.entry(view_number) {
Entry::Occupied(mut o) => {
o.get_mut()
.insert_into_result(id, paired_up, maybe_block_size)
let entry = o.get_mut();
let leaf = entry.insert_into_result(id, paired_up, maybe_block_size);

// Here we noticed is a node may start up and time out waiting for a proposal
// So we add the timeout to failed_views, but eventually the proposal is received we decide on the view
// If we do indeed have a view timeout for the node at this point we want to remove it
entry.cleanup_previous_timeouts_on_view(
&mut self.ctx.failed_views,
&view_number,
&(id as u64),
);

leaf
}
Entry::Vacant(v) => {
let mut round_result = RoundResult::default();
Expand Down Expand Up @@ -249,12 +266,21 @@ impl<TYPES: NodeType, I: TestableNodeImplementation<TYPES>, V: Versions> TestTas
expected_views_to_fail,
}: OverallSafetyPropertiesDescription<TYPES> = self.properties.clone();

let num_incomplete_views = self
.ctx
.round_results
.len()
.saturating_sub(self.ctx.successful_views.len())
.saturating_sub(self.ctx.failed_views.len());
let views_count = self.ctx.failed_views.len() + self.ctx.successful_views.len();
let results_count = self.ctx.round_results.len();

// This can cause tests to crash if we do the subtracting to get `num_incomplete_views` below
// So lets fail return an error instead
// Use this check instead of saturating_sub as that could hide a real problem
if views_count > results_count {
return TestResult::Fail(Box::new(
OverallSafetyTaskErr::<TYPES>::NotEnoughRoundResults {
results_count,
views_count,
},
));
}
let num_incomplete_views = results_count - views_count;

if self.ctx.successful_views.len() < num_successful_views {
return TestResult::Fail(Box::new(OverallSafetyTaskErr::<TYPES>::NotEnoughDecides {
Expand Down Expand Up @@ -540,6 +566,40 @@ impl<TYPES: NodeType> RoundResult<TYPES> {
}
leaves
}

fn cleanup_previous_timeouts_on_view(
&mut self,
failed_views: &mut HashSet<TYPES::Time>,
view_number: &TYPES::Time,
id: &u64,
) {
// check if this node had a previous timeout
match self.failed_nodes.get(id) {
Some(error) => match error.as_ref() {
HotShotError::ViewTimeoutError {
view_number,
state: _,
} => {
tracing::debug!(
"Node {} originally timeout for view: {:?}. It has now been decided on.",
id,
view_number
);
self.failed_nodes.remove(id);
}
_ => return,
},
None => return,
}

// check if no more failed nodes
if self.failed_nodes.is_empty() && failed_views.remove(view_number) {
tracing::debug!(
"Removed view {:?} from failed views, all nodes have agreed upon view.",
view_number
);
}
}
}

/// cross node safety properties
Expand Down
3 changes: 3 additions & 0 deletions crates/testing/tests/tests_2/catchup.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,9 @@ async fn test_catchup() {
async_compatibility_layer::logging::setup_backtrace();
let timing_data = TimingData {
next_view_timeout: 2000,
// increase the round delay for this test
// TODO: remove this delay increase for test - https://github.com/EspressoSystems/HotShot/issues/3673
round_start_delay: 200,
..Default::default()
};
let mut metadata: TestDescription<TestTypes, MemoryImpl, TestVersions> =
Expand Down