diff --git a/Cargo.lock b/Cargo.lock index 31a2e67a1cda8..35da3364f4890 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -9172,9 +9172,9 @@ dependencies = [ [[package]] name = "indicatif" -version = "0.17.8" +version = "0.17.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "763a5a8f45087d6bcea4222e7b72c291a054edf80e4ef6efd2a4979878c7bea3" +checksum = "fb28741c9db9a713d93deb3bb9515c20788cef5815265bee4980e87bde7e0f25" dependencies = [ "console", "instant", @@ -14051,7 +14051,7 @@ version = "0.39.0" source = "git+https://github.com/banool/self_update.git?rev=8306158ad0fd5b9d4766a3c6bf967e7ef0ea5c4b#8306158ad0fd5b9d4766a3c6bf967e7ef0ea5c4b" dependencies = [ "hyper", - "indicatif 0.17.8", + "indicatif 0.17.7", "log", "quick-xml 0.23.1", "regex", diff --git a/testsuite/forge/src/backend/k8s/chaos.rs b/testsuite/forge/src/backend/k8s/chaos.rs index 3dfb3b3a4ef48..03306bf9fdd2b 100644 --- a/testsuite/forge/src/backend/k8s/chaos.rs +++ b/testsuite/forge/src/backend/k8s/chaos.rs @@ -167,6 +167,7 @@ impl K8sSwarm { for group_netem in &swarm_netem.group_netems { let source_instance_labels = self.get_instance_labels(&group_netem.source_nodes); let target_instance_labels = self.get_instance_labels(&group_netem.target_nodes); + let service_targets = self.get_service_targets(&group_netem.target_nodes); network_chaos_specs.push(format!( include_str!(NETEM_CHAOS_TEMPLATE!()), @@ -180,6 +181,7 @@ impl K8sSwarm { instance_labels = &source_instance_labels, target_instance_labels = &target_instance_labels, rate = group_netem.rate_in_mbps, + service_targets = &service_targets, )); } @@ -285,4 +287,23 @@ impl K8sSwarm { INVALID_NODE_STRING } } + + fn get_service_name(&self, node: &AccountAddress) -> Option { + if let Some(validator) = self.validator(*node) { + validator.service_name() + } else if let Some(fullnode) = self.full_node(*node) { + fullnode.service_name() + } else { + // TODO: should we throw an error here instead of failing silently? 
+ None + } + } + + pub(crate) fn get_service_targets(&self, target_nodes: &[AccountAddress]) -> String { + target_nodes + .iter() + .filter_map(|node| self.get_service_name(node)) + .collect::>() + .join(",") + } } diff --git a/testsuite/forge/src/backend/k8s/chaos/netem.yaml b/testsuite/forge/src/backend/k8s/chaos/netem.yaml index 1957df33f898e..686f5a9e94510 100644 --- a/testsuite/forge/src/backend/k8s/chaos/netem.yaml +++ b/testsuite/forge/src/backend/k8s/chaos/netem.yaml @@ -22,7 +22,7 @@ spec: rate: "{rate}mbps" limit: 20971520 # placeholder value. not supported by tc netem buffer: 10000 # placeholder value. not supported by tc netem - direction: both + direction: to target: selector: namespaces: @@ -30,3 +30,5 @@ spec: expressionSelectors: - {{ key: app.kubernetes.io/instance, operator: In, values: [{target_instance_labels}] }} mode: all + # This is required to ensure that the network chaos is applied when using service IPs instead of pod IPs + externalTargets: [{service_targets}] diff --git a/testsuite/forge/src/backend/k8s/helm-values/aptos-node-default-values.yaml b/testsuite/forge/src/backend/k8s/helm-values/aptos-node-default-values.yaml index 2cebfd18189d3..5f30786f111a2 100644 --- a/testsuite/forge/src/backend/k8s/helm-values/aptos-node-default-values.yaml +++ b/testsuite/forge/src/backend/k8s/helm-values/aptos-node-default-values.yaml @@ -17,22 +17,20 @@ fullnode: # force enable the telemetry service to try to send telemetry force_enable_telemetry: true -# Make all services internal NodePort and open all ports -# NodePort is required for ChaosMesh to function correctly: https://github.com/chaos-mesh/chaos-mesh/issues/3278#issuecomment-1134248492 service: validator: external: - type: "NodePort" + type: "ClusterIP" internal: - type: "NodePort" + type: "ClusterIP" enableRestApi: true enableMetricsPort: true fullnode: external: - type: "NodePort" + type: "ClusterIP" internal: - type: "NodePort" + type: "ClusterIP" enableRestApi: true enableMetricsPort: 
true diff --git a/testsuite/forge/src/backend/k8s/node.rs b/testsuite/forge/src/backend/k8s/node.rs index 8fd0fec22badf..582da1ae89165 100644 --- a/testsuite/forge/src/backend/k8s/node.rs +++ b/testsuite/forge/src/backend/k8s/node.rs @@ -268,6 +268,10 @@ impl Node for K8sNode { ) .await } + + fn service_name(&self) -> Option { + Some(self.service_name.clone()) + } } impl Validator for K8sNode {} diff --git a/testsuite/forge/src/backend/local/node.rs b/testsuite/forge/src/backend/local/node.rs index 9487e40180e60..a2384d0febd2b 100644 --- a/testsuite/forge/src/backend/local/node.rs +++ b/testsuite/forge/src/backend/local/node.rs @@ -367,6 +367,10 @@ impl Node for LocalNode { fn expose_metric(&self) -> Result { Ok(0) } + + fn service_name(&self) -> Option { + None + } } impl Validator for LocalNode {} diff --git a/testsuite/forge/src/interface/node.rs b/testsuite/forge/src/interface/node.rs index 02dd405b97c59..e4dbcf24edfba 100644 --- a/testsuite/forge/src/interface/node.rs +++ b/testsuite/forge/src/interface/node.rs @@ -72,6 +72,8 @@ pub trait Node: Send + Sync { fn counter(&self, counter: &str, port: u64) -> Result; fn expose_metric(&self) -> Result; + + fn service_name(&self) -> Option; } /// Trait used to represent a running Validator diff --git a/testsuite/testcases/src/multi_region_network_test.rs b/testsuite/testcases/src/multi_region_network_test.rs index a4a60a062f818..9e44a837e5d4e 100644 --- a/testsuite/testcases/src/multi_region_network_test.rs +++ b/testsuite/testcases/src/multi_region_network_test.rs @@ -120,25 +120,39 @@ impl InterRegionNetEmConfig { let group_netems: Vec = peer_groups .iter() .combinations(2) - .map(|comb| { + .flat_map(|comb| { let (from_region, from_chunk, stats) = &comb[0]; let (to_region, to_chunk, _) = &comb[1]; - let (bandwidth, latency) = stats.get(to_region).unwrap(); - let netem = GroupNetEm { - name: format!("{}-to-{}-netem", from_region, to_region), - source_nodes: from_chunk.to_vec(), - target_nodes: to_chunk.to_vec(), - 
delay_latency_ms: *latency as u64, - delay_jitter_ms: self.delay_jitter_ms, - delay_correlation_percentage: self.delay_correlation_percentage, - loss_percentage: self.loss_percentage, - loss_correlation_percentage: self.loss_correlation_percentage, - rate_in_mbps: *bandwidth / 1e6 as u64, - }; - info!("inter-region netem {:?}", netem); - - netem + let (bandwidth, rtt_latency) = stats.get(to_region).unwrap(); + let hop_latency = rtt_latency / 2.0; + let netems = [ + GroupNetEm { + name: format!("{}-to-{}-netem", from_region, to_region), + source_nodes: from_chunk.to_vec(), + target_nodes: to_chunk.to_vec(), + delay_latency_ms: hop_latency as u64, + delay_jitter_ms: self.delay_jitter_ms, + delay_correlation_percentage: self.delay_correlation_percentage, + loss_percentage: self.loss_percentage, + loss_correlation_percentage: self.loss_correlation_percentage, + rate_in_mbps: *bandwidth / 1e6 as u64, + }, + GroupNetEm { + name: format!("{}-to-{}-netem", to_region, from_region), + source_nodes: to_chunk.to_vec(), + target_nodes: from_chunk.to_vec(), + delay_latency_ms: hop_latency as u64, + delay_jitter_ms: self.delay_jitter_ms, + delay_correlation_percentage: self.delay_correlation_percentage, + loss_percentage: self.loss_percentage, + loss_correlation_percentage: self.loss_correlation_percentage, + rate_in_mbps: *bandwidth / 1e6 as u64, + }, + ]; + info!("inter-region netem {:?}", netems); + + netems }) .collect(); diff --git a/testsuite/testcases/src/network_bandwidth_test.rs b/testsuite/testcases/src/network_bandwidth_test.rs index eb8f701565440..ba70db54942c9 100644 --- a/testsuite/testcases/src/network_bandwidth_test.rs +++ b/testsuite/testcases/src/network_bandwidth_test.rs @@ -6,6 +6,7 @@ use aptos_forge::{ GroupNetworkBandwidth, NetworkContext, NetworkTest, SwarmChaos, SwarmNetworkBandwidth, Test, }; +/// This is deprecated. 
Use [crate::multi_region_network_test::MultiRegionNetworkEmulationTest] instead pub struct NetworkBandwidthTest; // Bandwidth diff --git a/testsuite/testcases/src/network_loss_test.rs b/testsuite/testcases/src/network_loss_test.rs index f7f175555f7aa..7fd83aa344994 100644 --- a/testsuite/testcases/src/network_loss_test.rs +++ b/testsuite/testcases/src/network_loss_test.rs @@ -4,6 +4,7 @@ use crate::{LoadDestination, NetworkLoadTest}; use aptos_forge::{NetworkContext, NetworkTest, SwarmChaos, SwarmNetworkLoss, Test}; +/// This is deprecated. Use [crate::multi_region_network_test::MultiRegionNetworkEmulationTest] instead pub struct NetworkLossTest; // Loss parameters diff --git a/testsuite/testcases/src/network_partition_test.rs b/testsuite/testcases/src/network_partition_test.rs index 41659cf5c8468..8846b73a94ed6 100644 --- a/testsuite/testcases/src/network_partition_test.rs +++ b/testsuite/testcases/src/network_partition_test.rs @@ -4,6 +4,7 @@ use crate::{LoadDestination, NetworkLoadTest}; use aptos_forge::{NetworkContext, NetworkTest, SwarmChaos, SwarmNetworkPartition, Test}; +/// This is deprecated. Use [crate::multi_region_network_test::MultiRegionNetworkEmulationTest] instead pub struct NetworkPartitionTest; // Partition diff --git a/testsuite/testcases/src/three_region_simulation_test.rs b/testsuite/testcases/src/three_region_simulation_test.rs index 916bbcc86c322..c7d97fd3ecf29 100644 --- a/testsuite/testcases/src/three_region_simulation_test.rs +++ b/testsuite/testcases/src/three_region_simulation_test.rs @@ -22,6 +22,8 @@ impl Test for ThreeRegionSameCloudSimulationTest { /// 2. Each region has minimal network delay amongst its nodes /// 3. Each region has a network delay to the other two regions, as estimated by https://www.cloudping.co/grid /// 4. Currently simulating a 50 percentile network delay between us-west <--> af-south <--> eu-north +/// +/// This is deprecated and flawed. 
Use [crate::multi_region_network_test::MultiRegionNetworkEmulationTest] instead fn create_three_region_swarm_network_delay(swarm: &dyn Swarm) -> SwarmNetworkDelay { let all_validators = swarm.validators().map(|v| v.peer_id()).collect::<Vec<_>>();