Skip to content

Commit

Permalink
[forge][chaos] Expand chaos simulation to six regions (#14860)
Browse files Browse the repository at this point in the history
* [forge][chaos] expand to six regions

* increase chaos timeout interval

* 4 region is default
  • Loading branch information
ibalajiarun authored Oct 16, 2024
1 parent c8511eb commit 8a540dc
Show file tree
Hide file tree
Showing 5 changed files with 102 additions and 25 deletions.
28 changes: 17 additions & 11 deletions testsuite/forge-cli/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -811,9 +811,12 @@ fn get_multi_region_test(test_name: &str) -> Option<ForgeConfig> {
Some(test)
}

fn wrap_with_realistic_env<T: NetworkTest + 'static>(test: T) -> CompositeNetworkTest {
fn wrap_with_realistic_env<T: NetworkTest + 'static>(
num_validators: usize,
test: T,
) -> CompositeNetworkTest {
CompositeNetworkTest::new_with_two_wrappers(
MultiRegionNetworkEmulationTest::default(),
MultiRegionNetworkEmulationTest::default_for_validator_count(num_validators),
CpuChaosTest::default(),
test,
)
Expand Down Expand Up @@ -858,8 +861,9 @@ fn wrap_with_two_region_env<T: NetworkTest + 'static>(test: T) -> CompositeNetwo
}

fn run_consensus_only_realistic_env_max_tps() -> ForgeConfig {
let num_validators = 20;
ForgeConfig::default()
.with_initial_validator_count(NonZeroUsize::new(20).unwrap())
.with_initial_validator_count(NonZeroUsize::new(num_validators).unwrap())
.with_emit_job(
EmitJobRequest::default()
.mode(EmitJobMode::MaxLoad {
Expand All @@ -868,7 +872,7 @@ fn run_consensus_only_realistic_env_max_tps() -> ForgeConfig {
.txn_expiration_time_secs(5 * 60),
)
.add_network_test(CompositeNetworkTest::new(
MultiRegionNetworkEmulationTest::default(),
MultiRegionNetworkEmulationTest::default_for_validator_count(num_validators),
CpuChaosTest::default(),
))
.with_genesis_helm_config_fn(Arc::new(|helm_values| {
Expand Down Expand Up @@ -1119,7 +1123,7 @@ fn realistic_env_sweep_wrap(
.with_validator_override_node_config_fn(Arc::new(|config, _| {
config.execution.processed_transactions_detailed_counters = true;
}))
.add_network_test(wrap_with_realistic_env(test))
.add_network_test(wrap_with_realistic_env(num_validators, test))
// Test inherits the main EmitJobRequest, so update here for more precise latency measurements
.with_emit_job(
EmitJobRequest::default().latency_polling_interval(Duration::from_millis(100)),
Expand Down Expand Up @@ -1388,10 +1392,11 @@ fn workload_vs_perf_benchmark() -> ForgeConfig {
}

fn realistic_env_graceful_overload(duration: Duration) -> ForgeConfig {
let num_validators = 20;
ForgeConfig::default()
.with_initial_validator_count(NonZeroUsize::new(20).unwrap())
.with_initial_validator_count(NonZeroUsize::new(num_validators).unwrap())
.with_initial_fullnode_count(20)
.add_network_test(wrap_with_realistic_env(TwoTrafficsTest {
.add_network_test(wrap_with_realistic_env(num_validators, TwoTrafficsTest {
inner_traffic: EmitJobRequest::default()
.mode(EmitJobMode::ConstTps { tps: 15000 })
.init_gas_price_multiplier(20),
Expand Down Expand Up @@ -1952,7 +1957,7 @@ fn realistic_env_max_load_test(
ForgeConfig::default()
.with_initial_validator_count(NonZeroUsize::new(num_validators).unwrap())
.with_initial_fullnode_count(num_fullnodes)
.add_network_test(wrap_with_realistic_env(TwoTrafficsTest {
.add_network_test(wrap_with_realistic_env(num_validators, TwoTrafficsTest {
inner_traffic: EmitJobRequest::default()
.mode(EmitJobMode::MaxLoad { mempool_backlog })
.init_gas_price_multiplier(20),
Expand Down Expand Up @@ -2013,7 +2018,7 @@ fn realistic_network_tuned_for_throughput_test() -> ForgeConfig {

let mut forge_config = ForgeConfig::default()
.with_initial_validator_count(NonZeroUsize::new(VALIDATOR_COUNT).unwrap())
.add_network_test(MultiRegionNetworkEmulationTest::default())
.add_network_test(MultiRegionNetworkEmulationTest::default_for_validator_count(VALIDATOR_COUNT))
.with_emit_job(EmitJobRequest::default().mode(EmitJobMode::MaxLoad {
mempool_backlog: (TARGET_TPS as f64 * VFN_LATENCY_S) as usize,
}))
Expand Down Expand Up @@ -2326,8 +2331,9 @@ fn quorum_store_reconfig_enable_test() -> ForgeConfig {
}

fn mainnet_like_simulation_test() -> ForgeConfig {
let num_validators = 20;
ForgeConfig::default()
.with_initial_validator_count(NonZeroUsize::new(20).unwrap())
.with_initial_validator_count(NonZeroUsize::new(num_validators).unwrap())
.with_emit_job(
EmitJobRequest::default()
.mode(EmitJobMode::MaxLoad {
Expand All @@ -2336,7 +2342,7 @@ fn mainnet_like_simulation_test() -> ForgeConfig {
.txn_expiration_time_secs(5 * 60),
)
.add_network_test(CompositeNetworkTest::new(
MultiRegionNetworkEmulationTest::default(),
MultiRegionNetworkEmulationTest::default_for_validator_count(num_validators),
CpuChaosTest::default(),
))
.with_genesis_helm_config_fn(Arc::new(|helm_values| {
Expand Down
2 changes: 1 addition & 1 deletion testsuite/forge-cli/src/suites/dag.rs
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ fn dag_realistic_env_max_load_test(
ForgeConfig::default()
.with_initial_validator_count(NonZeroUsize::new(num_validators).unwrap())
.with_initial_fullnode_count(num_fullnodes)
.add_network_test(wrap_with_realistic_env(TwoTrafficsTest {
.add_network_test(wrap_with_realistic_env(num_validators, TwoTrafficsTest {
inner_traffic: EmitJobRequest::default()
.mode(EmitJobMode::MaxLoad {
mempool_backlog: 50000,
Expand Down
34 changes: 24 additions & 10 deletions testsuite/forge/src/backend/k8s/swarm.rs
Original file line number Diff line number Diff line change
Expand Up @@ -735,8 +735,8 @@ trait ChaosExperimentOps {
async fn list_stress_chaos(&self) -> Result<Vec<StressChaos>>;

async fn ensure_chaos_experiments_active(&self) -> Result<()> {
let timeout_duration = Duration::from_secs(300); // 5 minutes
let polling_interval = Duration::from_secs(5);
let timeout_duration = Duration::from_secs(600); // 10 minutes
let polling_interval = Duration::from_secs(10);

tokio::time::timeout(timeout_duration, async {
loop {
Expand Down Expand Up @@ -793,6 +793,8 @@ fn check_all_injected(status: &Option<ChaosStatus>) -> bool {
.map_or(false, |conditions| {
conditions.iter().any(|c| {
c.r#type == ChaosConditionType::AllInjected && c.status == ConditionStatus::True
}) && conditions.iter().any(|c| {
c.r#type == ChaosConditionType::Selected && c.status == ConditionStatus::True
})
})
}
Expand Down Expand Up @@ -870,19 +872,31 @@ mod tests {
) -> (Vec<NetworkChaos>, Vec<StressChaos>) {
let network_chaos = NetworkChaos {
status: Some(ChaosStatus {
conditions: Some(vec![ChaosCondition {
r#type: ChaosConditionType::AllInjected,
status: network_status,
}]),
conditions: Some(vec![
ChaosCondition {
r#type: ChaosConditionType::AllInjected,
status: network_status.clone(),
},
ChaosCondition {
r#type: ChaosConditionType::Selected,
status: network_status,
},
]),
}),
..NetworkChaos::new("test", Default::default())
};
let stress_chaos = StressChaos {
status: Some(ChaosStatus {
conditions: Some(vec![ChaosCondition {
r#type: ChaosConditionType::AllInjected,
status: stress_status,
}]),
conditions: Some(vec![
ChaosCondition {
r#type: ChaosConditionType::AllInjected,
status: stress_status.clone(),
},
ChaosCondition {
r#type: ChaosConditionType::Selected,
status: stress_status,
},
]),
}),
..StressChaos::new("test", Default::default())
};
Expand Down
31 changes: 31 additions & 0 deletions testsuite/testcases/src/data/six_region_link_stats.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
sending_region,receiving_region,bitrate_bps,avgrtt
gcp--us-central1,aws--eu-west-1,300000000,103.435
gcp--us-central1,aws--ap-northeast-1,300000000,133.996
gcp--us-central1,aws--sa-east-1,300000000,145.483
gcp--us-central1,aws--eu-central1,300000000,107.671
gcp--us-central1,gcp--ca-central-1,300000000,29.748
aws--sa-east-1,gcp--us-central1,300000000,145.703
aws--sa-east-1,aws--eu-west-1,300000000,176.894
aws--sa-east-1,aws--ap-northeast-1,300000000,255.289
aws--sa-east-1,aws--eu-central1,300000000,203.508
aws--sa-east-1,gcp--ca-central-1,300000000,124.307
aws--eu-west-1,gcp--us-central1,300000000,104.169
aws--eu-west-1,aws--sa-east-1,300000000,176.813
aws--eu-west-1,aws--ap-northeast-1,300000000,198.555
aws--eu-west-1,aws--eu-central1,300000000,23.493
aws--eu-west-1,gcp--ca-central-1,300000000,68.622
aws--ap-northeast-1,gcp--us-central1,300000000,128.999
aws--ap-northeast-1,aws--eu-west-1,300000000,198.539
aws--ap-northeast-1,aws--sa-east-1,300000000,255.323
aws--ap-northeast-1,aws--eu-central1,300000000,223.400
aws--ap-northeast-1,gcp--ca-central-1,300000000,142.549
aws--eu-central1,gcp--us-central1,300000000,107.671
aws--eu-central1,aws--sa-east-1,300000000,203.508
aws--eu-central1,aws--eu-west-1,300000000,23.493
aws--eu-central1,aws--ap-northeast-1,300000000,223.400
aws--eu-central1,gcp--ca-central-1,300000000,89.889
gcp--ca-central-1,gcp--us-central1,300000000,29.748
gcp--ca-central-1,aws--sa-east-1,300000000,124.307
gcp--ca-central-1,aws--eu-west-1,300000000,68.622
gcp--ca-central-1,aws--ap-northeast-1,300000000,142.549
gcp--ca-central-1,aws--eu-central1,300000000,89.889
32 changes: 29 additions & 3 deletions testsuite/testcases/src/multi_region_network_test.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ use std::{collections::BTreeMap, sync::Arc};
/// is measuring TCP bandwidth only which is primarily affected by RTT, and not the actual bandwidth
/// across the regions, which would vary according to competing traffic, etc.
const FOUR_REGION_LINK_STATS: &[u8] = include_bytes!("data/four_region_link_stats.csv");
const SIX_REGION_LINK_STATS: &[u8] = include_bytes!("data/six_region_link_stats.csv");
/// The two regions were chosen as the most distant regions among the four regions set.
const TWO_REGION_LINK_STATS: &[u8] = include_bytes!("data/two_region_link_stats.csv");

Expand Down Expand Up @@ -79,8 +80,8 @@ fn create_link_stats_table_with_peer_groups(
"At least 2 regions are required for inter-region network chaos."
);
assert!(
number_of_regions <= 4,
"ChaosMesh only supports simulating up to 4 regions."
number_of_regions <= 6,
"ChaosMesh only supports simulating up to 6 regions."
);

// Create the link stats table with peer groups
Expand Down Expand Up @@ -237,10 +238,23 @@ impl MultiRegionNetworkEmulationConfig {
..Default::default()
}
}

/// Builds an emulation config whose link stats table is parsed from the
/// embedded `FOUR_REGION_LINK_STATS` CSV; every other field keeps its
/// `Default` value.
pub fn four_regions() -> Self {
    let link_stats_table = get_link_stats_table(FOUR_REGION_LINK_STATS);
    Self {
        link_stats_table,
        ..Default::default()
    }
}

/// Builds an emulation config whose link stats table is parsed from the
/// embedded `SIX_REGION_LINK_STATS` CSV; every other field keeps its
/// `Default` value.
pub fn six_regions() -> Self {
    let link_stats_table = get_link_stats_table(SIX_REGION_LINK_STATS);
    Self {
        link_stats_table,
        ..Default::default()
    }
}
}

/// A test to emulate network conditions for a multi-region setup.
#[derive(Default)]
pub struct MultiRegionNetworkEmulationTest {
network_emulation_config: MultiRegionNetworkEmulationConfig,
}
Expand All @@ -252,6 +266,18 @@ impl MultiRegionNetworkEmulationTest {
}
}

/// Chooses the default emulation profile based on swarm size.
///
/// Swarms with more than 100 validators get the six-region configuration;
/// anything at or below 100 gets the four-region configuration.
pub fn default_for_validator_count(num_validators: usize) -> Self {
    let network_emulation_config = match num_validators {
        n if n > 100 => MultiRegionNetworkEmulationConfig::six_regions(),
        _ => MultiRegionNetworkEmulationConfig::four_regions(),
    };
    Self {
        network_emulation_config,
    }
}

/// Creates a new SwarmNetEm to be injected via chaos. Note: network
/// emulation is only done for the validators in the swarm (and not
/// the fullnodes).
Expand Down

0 comments on commit 8a540dc

Please sign in to comment.