Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[forge][chaos] Expand chaos simulation to six regions #14860

Merged
merged 4 commits into from
Oct 16, 2024
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 17 additions & 11 deletions testsuite/forge-cli/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -811,9 +811,12 @@ fn get_multi_region_test(test_name: &str) -> Option<ForgeConfig> {
Some(test)
}

fn wrap_with_realistic_env<T: NetworkTest + 'static>(test: T) -> CompositeNetworkTest {
fn wrap_with_realistic_env<T: NetworkTest + 'static>(
num_validators: usize,
test: T,
) -> CompositeNetworkTest {
CompositeNetworkTest::new_with_two_wrappers(
MultiRegionNetworkEmulationTest::default(),
MultiRegionNetworkEmulationTest::default_for_validator_count(num_validators),
CpuChaosTest::default(),
test,
)
Expand Down Expand Up @@ -858,8 +861,9 @@ fn wrap_with_two_region_env<T: NetworkTest + 'static>(test: T) -> CompositeNetwo
}

fn run_consensus_only_realistic_env_max_tps() -> ForgeConfig {
let num_validators = 20;
ForgeConfig::default()
.with_initial_validator_count(NonZeroUsize::new(20).unwrap())
.with_initial_validator_count(NonZeroUsize::new(num_validators).unwrap())
.with_emit_job(
EmitJobRequest::default()
.mode(EmitJobMode::MaxLoad {
Expand All @@ -868,7 +872,7 @@ fn run_consensus_only_realistic_env_max_tps() -> ForgeConfig {
.txn_expiration_time_secs(5 * 60),
)
.add_network_test(CompositeNetworkTest::new(
MultiRegionNetworkEmulationTest::default(),
MultiRegionNetworkEmulationTest::default_for_validator_count(num_validators),
CpuChaosTest::default(),
))
.with_genesis_helm_config_fn(Arc::new(|helm_values| {
Expand Down Expand Up @@ -1119,7 +1123,7 @@ fn realistic_env_sweep_wrap(
.with_validator_override_node_config_fn(Arc::new(|config, _| {
config.execution.processed_transactions_detailed_counters = true;
}))
.add_network_test(wrap_with_realistic_env(test))
.add_network_test(wrap_with_realistic_env(num_validators, test))
// Test inherits the main EmitJobRequest, so update here for more precise latency measurements
.with_emit_job(
EmitJobRequest::default().latency_polling_interval(Duration::from_millis(100)),
Expand Down Expand Up @@ -1388,10 +1392,11 @@ fn workload_vs_perf_benchmark() -> ForgeConfig {
}

fn realistic_env_graceful_overload(duration: Duration) -> ForgeConfig {
let num_validators = 20;
ForgeConfig::default()
.with_initial_validator_count(NonZeroUsize::new(20).unwrap())
.with_initial_validator_count(NonZeroUsize::new(num_validators).unwrap())
.with_initial_fullnode_count(20)
.add_network_test(wrap_with_realistic_env(TwoTrafficsTest {
.add_network_test(wrap_with_realistic_env(num_validators, TwoTrafficsTest {
inner_traffic: EmitJobRequest::default()
.mode(EmitJobMode::ConstTps { tps: 15000 })
.init_gas_price_multiplier(20),
Expand Down Expand Up @@ -1952,7 +1957,7 @@ fn realistic_env_max_load_test(
ForgeConfig::default()
.with_initial_validator_count(NonZeroUsize::new(num_validators).unwrap())
.with_initial_fullnode_count(num_fullnodes)
.add_network_test(wrap_with_realistic_env(TwoTrafficsTest {
.add_network_test(wrap_with_realistic_env(num_validators, TwoTrafficsTest {
inner_traffic: EmitJobRequest::default()
.mode(EmitJobMode::MaxLoad { mempool_backlog })
.init_gas_price_multiplier(20),
Expand Down Expand Up @@ -2013,7 +2018,7 @@ fn realistic_network_tuned_for_throughput_test() -> ForgeConfig {

let mut forge_config = ForgeConfig::default()
.with_initial_validator_count(NonZeroUsize::new(VALIDATOR_COUNT).unwrap())
.add_network_test(MultiRegionNetworkEmulationTest::default())
.add_network_test(MultiRegionNetworkEmulationTest::default_for_validator_count(VALIDATOR_COUNT))
.with_emit_job(EmitJobRequest::default().mode(EmitJobMode::MaxLoad {
mempool_backlog: (TARGET_TPS as f64 * VFN_LATENCY_S) as usize,
}))
Expand Down Expand Up @@ -2326,8 +2331,9 @@ fn quorum_store_reconfig_enable_test() -> ForgeConfig {
}

fn mainnet_like_simulation_test() -> ForgeConfig {
let num_validators = 20;
ForgeConfig::default()
.with_initial_validator_count(NonZeroUsize::new(20).unwrap())
.with_initial_validator_count(NonZeroUsize::new(num_validators).unwrap())
.with_emit_job(
EmitJobRequest::default()
.mode(EmitJobMode::MaxLoad {
Expand All @@ -2336,7 +2342,7 @@ fn mainnet_like_simulation_test() -> ForgeConfig {
.txn_expiration_time_secs(5 * 60),
)
.add_network_test(CompositeNetworkTest::new(
MultiRegionNetworkEmulationTest::default(),
MultiRegionNetworkEmulationTest::default_for_validator_count(num_validators),
CpuChaosTest::default(),
))
.with_genesis_helm_config_fn(Arc::new(|helm_values| {
Expand Down
2 changes: 1 addition & 1 deletion testsuite/forge-cli/src/suites/dag.rs
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ fn dag_realistic_env_max_load_test(
ForgeConfig::default()
.with_initial_validator_count(NonZeroUsize::new(num_validators).unwrap())
.with_initial_fullnode_count(num_fullnodes)
.add_network_test(wrap_with_realistic_env(TwoTrafficsTest {
.add_network_test(wrap_with_realistic_env(num_validators, TwoTrafficsTest {
inner_traffic: EmitJobRequest::default()
.mode(EmitJobMode::MaxLoad {
mempool_backlog: 50000,
Expand Down
6 changes: 4 additions & 2 deletions testsuite/forge/src/backend/k8s/swarm.rs
Original file line number Diff line number Diff line change
Expand Up @@ -735,8 +735,8 @@ trait ChaosExperimentOps {
async fn list_stress_chaos(&self) -> Result<Vec<StressChaos>>;

async fn ensure_chaos_experiments_active(&self) -> Result<()> {
let timeout_duration = Duration::from_secs(300); // 5 minutes
let polling_interval = Duration::from_secs(5);
let timeout_duration = Duration::from_secs(600); // 10 minutes
let polling_interval = Duration::from_secs(10);

tokio::time::timeout(timeout_duration, async {
loop {
Expand Down Expand Up @@ -793,6 +793,8 @@ fn check_all_injected(status: &Option<ChaosStatus>) -> bool {
.map_or(false, |conditions| {
conditions.iter().any(|c| {
c.r#type == ChaosConditionType::AllInjected && c.status == ConditionStatus::True
}) && conditions.iter().any(|c| {
c.r#type == ChaosConditionType::Selected && c.status == ConditionStatus::True
})
})
}
Expand Down
31 changes: 31 additions & 0 deletions testsuite/testcases/src/data/six_region_link_stats.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
sending_region,receiving_region,bitrate_bps,avgrtt
gcp--us-central1,aws--eu-west-1,300000000,103.435
gcp--us-central1,aws--ap-northeast-1,300000000,133.996
gcp--us-central1,aws--sa-east-1,300000000,145.483
gcp--us-central1,aws--eu-central1,300000000,107.671
gcp--us-central1,gcp--ca-central-1,300000000,29.748
aws--sa-east-1,gcp--us-central1,300000000,145.703
aws--sa-east-1,aws--eu-west-1,300000000,176.894
aws--sa-east-1,aws--ap-northeast-1,300000000,255.289
aws--sa-east-1,aws--eu-central1,300000000,203.508
aws--sa-east-1,gcp--ca-central-1,300000000,124.307
aws--eu-west-1,gcp--us-central1,300000000,104.169
aws--eu-west-1,aws--sa-east-1,300000000,176.813
aws--eu-west-1,aws--ap-northeast-1,300000000,198.555
aws--eu-west-1,aws--eu-central1,300000000,23.493
aws--eu-west-1,gcp--ca-central-1,300000000,68.622
aws--ap-northeast-1,gcp--us-central1,300000000,128.999
aws--ap-northeast-1,aws--eu-west-1,300000000,198.539
aws--ap-northeast-1,aws--sa-east-1,300000000,255.323
aws--ap-northeast-1,aws--eu-central1,300000000,223.400
aws--ap-northeast-1,gcp--ca-central-1,300000000,142.549
aws--eu-central1,gcp--us-central1,300000000,107.671
aws--eu-central1,aws--sa-east-1,300000000,203.508
aws--eu-central1,aws--eu-west-1,300000000,23.493
aws--eu-central1,aws--ap-northeast-1,300000000,223.400
aws--eu-central1,gcp--ca-central-1,300000000,89.889
gcp--ca-central-1,gcp--us-central1,300000000,29.748
gcp--ca-central-1,aws--sa-east-1,300000000,124.307
gcp--ca-central-1,aws--eu-west-1,300000000,68.622
gcp--ca-central-1,aws--ap-northeast-1,300000000,142.549
gcp--ca-central-1,aws--eu-central1,300000000,89.889
32 changes: 29 additions & 3 deletions testsuite/testcases/src/multi_region_network_test.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ use std::{collections::BTreeMap, sync::Arc};
/// is measuring TCP bandwidth only which is primarily affected by RTT, and not the actual bandwidth
/// across the regions, which would vary according to competing traffic, etc.
const FOUR_REGION_LINK_STATS: &[u8] = include_bytes!("data/four_region_link_stats.csv");
const SIX_REGION_LINK_STATS: &[u8] = include_bytes!("data/six_region_link_stats.csv");
/// The two regions were chosen as the most distant regions among the four regions set.
const TWO_REGION_LINK_STATS: &[u8] = include_bytes!("data/two_region_link_stats.csv");

Expand Down Expand Up @@ -79,8 +80,8 @@ fn create_link_stats_table_with_peer_groups(
"At least 2 regions are required for inter-region network chaos."
);
assert!(
number_of_regions <= 4,
"ChaosMesh only supports simulating up to 4 regions."
number_of_regions <= 6,
"ChaosMesh only supports simulating up to 6 regions."
);

// Create the link stats table with peer groups
Expand Down Expand Up @@ -237,10 +238,23 @@ impl MultiRegionNetworkEmulationConfig {
..Default::default()
}
}

/// Builds a config whose link stats table is produced by
/// `get_link_stats_table` from `FOUR_REGION_LINK_STATS`; every other
/// field keeps its `Default` value.
pub fn four_regions() -> Self {
    let link_stats_table = get_link_stats_table(FOUR_REGION_LINK_STATS);
    Self {
        link_stats_table,
        ..Default::default()
    }
}

/// Builds a config whose link stats table is produced by
/// `get_link_stats_table` from `SIX_REGION_LINK_STATS`; every other
/// field keeps its `Default` value.
pub fn six_regions() -> Self {
    let link_stats_table = get_link_stats_table(SIX_REGION_LINK_STATS);
    Self {
        link_stats_table,
        ..Default::default()
    }
}
}

/// A test to emulate network conditions for a multi-region setup.
#[derive(Default)]
pub struct MultiRegionNetworkEmulationTest {
network_emulation_config: MultiRegionNetworkEmulationConfig,
}
Expand All @@ -252,6 +266,18 @@ impl MultiRegionNetworkEmulationTest {
}
}

/// Chooses the default emulation config for a swarm with `num_validators`
/// validators: the six-region config when there are more than 100
/// validators, the four-region config otherwise.
pub fn default_for_validator_count(num_validators: usize) -> Self {
    let network_emulation_config = match num_validators {
        0..=100 => MultiRegionNetworkEmulationConfig::four_regions(),
        _ => MultiRegionNetworkEmulationConfig::six_regions(),
    };
    Self {
        network_emulation_config,
    }
}

/// Creates a new SwarmNetEm to be injected via chaos. Note: network
/// emulation is only done for the validators in the swarm (and not
/// the fullnodes).
Expand Down
Loading