Skip to content

Commit

Permalink
[forge][chaos-mesh] emulate packets to service targets
Browse files Browse the repository at this point in the history
  • Loading branch information
ibalajiarun committed May 31, 2024
1 parent 2e0d258 commit 591ee0c
Show file tree
Hide file tree
Showing 13 changed files with 84 additions and 27 deletions.
6 changes: 3 additions & 3 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion terraform/helm/genesis/files/genesis.sh
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ for i in $(seq 0 $(($NUM_VALIDATORS - 1))); do
cluster=${MULTICLUSTER_DOMAIN_SUFFIXES[${index}]}
validator_host="${username}-${VALIDATOR_INTERNAL_HOST_SUFFIX}.${NAMESPACE}.svc.${cluster}:6180"
else
validator_host="${username}-${VALIDATOR_INTERNAL_HOST_SUFFIX}:6180"
validator_host="${username}-${VALIDATOR_INTERNAL_HOST_SUFFIX}.${NAMESPACE}.svc.cluster.local:6180"
fi

if [ $i -lt $NUM_VALIDATORS_WITH_LARGER_STAKE ]; then
Expand Down
21 changes: 21 additions & 0 deletions testsuite/forge/src/backend/k8s/chaos.rs
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,7 @@ impl K8sSwarm {
for group_netem in &swarm_netem.group_netems {
let source_instance_labels = self.get_instance_labels(&group_netem.source_nodes);
let target_instance_labels = self.get_instance_labels(&group_netem.target_nodes);
let service_targets = self.get_service_targets(&group_netem.target_nodes);

network_chaos_specs.push(format!(
include_str!(NETEM_CHAOS_TEMPLATE!()),
Expand All @@ -180,6 +181,7 @@ impl K8sSwarm {
instance_labels = &source_instance_labels,
target_instance_labels = &target_instance_labels,
rate = group_netem.rate_in_mbps,
service_targets = &service_targets,
));
}

Expand Down Expand Up @@ -285,4 +287,23 @@ impl K8sSwarm {
INVALID_NODE_STRING
}
}

/// Resolves the service name of the node identified by `node`.
///
/// The address is looked up among the validators first; if one matches, its
/// `service_name()` is returned as-is. Otherwise the full nodes are consulted
/// the same way. Yields `None` when the address matches neither.
fn get_service_name(&self, node: &AccountAddress) -> Option<String> {
    // A validator match wins outright, even if it reports no service name.
    if let Some(validator) = self.validator(*node) {
        return validator.service_name();
    }

    match self.full_node(*node) {
        Some(fullnode) => fullnode.service_name(),
        // TODO: should we throw an error here instead of failing silently?
        None => None,
    }
}

/// Builds a comma-separated list of the service names of `target_nodes`.
///
/// Each address is resolved through `get_service_name`; addresses that do
/// not resolve to a service name are skipped. An empty string is returned
/// when nothing resolves.
pub(crate) fn get_service_targets(&self, target_nodes: &[AccountAddress]) -> String {
    let mut service_names: Vec<String> = Vec::with_capacity(target_nodes.len());
    for address in target_nodes {
        if let Some(name) = self.get_service_name(address) {
            service_names.push(name);
        }
    }
    service_names.join(",")
}
}
4 changes: 3 additions & 1 deletion testsuite/forge/src/backend/k8s/chaos/netem.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,13 @@ spec:
rate: "{rate}mbps"
limit: 20971520 # placeholder value. not supported by tc netem
buffer: 10000 # placeholder value. not supported by tc netem
direction: both
direction: to
target:
selector:
namespaces:
- {namespace}
expressionSelectors:
- {{ key: app.kubernetes.io/instance, operator: In, values: [{target_instance_labels}] }}
mode: all
# This is required to ensure that the network chaos is applied when using service IPs instead of pod IPs
externalTargets: [{service_targets}]
Original file line number Diff line number Diff line change
Expand Up @@ -17,22 +17,20 @@ fullnode:
# force enable the telemetry service to try to send telemetry
force_enable_telemetry: true

# Make all services internal NodePort and open all ports
# NodePort is required for ChaosMesh to function correctly: https://github.com/chaos-mesh/chaos-mesh/issues/3278#issuecomment-1134248492
service:
validator:
external:
type: "NodePort"
type: "ClusterIP"
internal:
type: "NodePort"
type: "ClusterIP"
enableRestApi: true
enableMetricsPort: true

fullnode:
external:
type: "NodePort"
type: "ClusterIP"
internal:
type: "NodePort"
type: "ClusterIP"
enableRestApi: true
enableMetricsPort: true

Expand Down
4 changes: 4 additions & 0 deletions testsuite/forge/src/backend/k8s/node.rs
Original file line number Diff line number Diff line change
Expand Up @@ -268,6 +268,10 @@ impl Node for K8sNode {
)
.await
}

/// Returns an owned copy of this node's stored `service_name`; always `Some`.
fn service_name(&self) -> Option<String> {
    let name = self.service_name.clone();
    Some(name)
}
}

impl Validator for K8sNode {}
Expand Down
4 changes: 4 additions & 0 deletions testsuite/forge/src/backend/local/node.rs
Original file line number Diff line number Diff line change
Expand Up @@ -367,6 +367,10 @@ impl Node for LocalNode {
fn expose_metric(&self) -> Result<u64> {
Ok(0)
}

/// Always returns `None` for a local node.
// NOTE(review): presumably because locally spawned nodes have no service in
// front of them -- confirm.
fn service_name(&self) -> Option<String> {
None
}
}

impl Validator for LocalNode {}
Expand Down
2 changes: 2 additions & 0 deletions testsuite/forge/src/interface/node.rs
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,8 @@ pub trait Node: Send + Sync {
fn counter(&self, counter: &str, port: u64) -> Result<f64>;

fn expose_metric(&self) -> Result<u64>;

/// Returns the name of the service associated with this node, or `None`
/// when the node has no such service.
fn service_name(&self) -> Option<String>;
}

/// Trait used to represent a running Validator
Expand Down
45 changes: 30 additions & 15 deletions testsuite/testcases/src/multi_region_network_test.rs
Original file line number Diff line number Diff line change
Expand Up @@ -124,22 +124,37 @@ impl InterRegionNetEmConfig {
let (from_region, from_chunk, stats) = &comb[0];
let (to_region, to_chunk, _) = &comb[1];

let (bandwidth, latency) = stats.get(to_region).unwrap();
let netem = GroupNetEm {
name: format!("{}-to-{}-netem", from_region, to_region),
source_nodes: from_chunk.to_vec(),
target_nodes: to_chunk.to_vec(),
delay_latency_ms: *latency as u64,
delay_jitter_ms: self.delay_jitter_ms,
delay_correlation_percentage: self.delay_correlation_percentage,
loss_percentage: self.loss_percentage,
loss_correlation_percentage: self.loss_correlation_percentage,
rate_in_mbps: *bandwidth / 1e6 as u64,
};
info!("inter-region netem {:?}", netem);

netem
let (bandwidth, rtt_latency) = stats.get(to_region).unwrap();
let hop_latency = rtt_latency / 2.0;
let netems = [
GroupNetEm {
name: format!("{}-to-{}-netem", from_region, to_region),
source_nodes: from_chunk.to_vec(),
target_nodes: to_chunk.to_vec(),
delay_latency_ms: hop_latency as u64,
delay_jitter_ms: self.delay_jitter_ms,
delay_correlation_percentage: self.delay_correlation_percentage,
loss_percentage: self.loss_percentage,
loss_correlation_percentage: self.loss_correlation_percentage,
rate_in_mbps: *bandwidth / 1e6 as u64,
},
GroupNetEm {
name: format!("{}-to-{}-netem", to_region, from_region),
source_nodes: to_chunk.to_vec(),
target_nodes: from_chunk.to_vec(),
delay_latency_ms: hop_latency as u64,
delay_jitter_ms: self.delay_jitter_ms,
delay_correlation_percentage: self.delay_correlation_percentage,
loss_percentage: self.loss_percentage,
loss_correlation_percentage: self.loss_correlation_percentage,
rate_in_mbps: *bandwidth / 1e6 as u64,
},
];
info!("inter-region netem {:?}", netems);

netems
})
.flatten()
.collect();

group_netems
Expand Down
2 changes: 2 additions & 0 deletions testsuite/testcases/src/network_bandwidth_test.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ use aptos_forge::{
GroupNetworkBandwidth, NetworkContext, NetworkTest, SwarmChaos, SwarmNetworkBandwidth, Test,
};

/// This is deprecated. Use [`crate::multi_region_network_test::MultiRegionNetworkEmulationTest`] instead.
// The `note` makes the compiler's deprecation warning carry the migration hint.
#[deprecated(note = "use `multi_region_network_test::MultiRegionNetworkEmulationTest` instead")]
pub struct NetworkBandwidthTest;

// Bandwidth
Expand Down
2 changes: 2 additions & 0 deletions testsuite/testcases/src/network_loss_test.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
use crate::{LoadDestination, NetworkLoadTest};
use aptos_forge::{NetworkContext, NetworkTest, SwarmChaos, SwarmNetworkLoss, Test};

/// This is deprecated. Use [`crate::multi_region_network_test::MultiRegionNetworkEmulationTest`] instead.
// The `note` makes the compiler's deprecation warning carry the migration hint.
#[deprecated(note = "use `multi_region_network_test::MultiRegionNetworkEmulationTest` instead")]
pub struct NetworkLossTest;

// Loss parameters
Expand Down
2 changes: 2 additions & 0 deletions testsuite/testcases/src/network_partition_test.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
use crate::{LoadDestination, NetworkLoadTest};
use aptos_forge::{NetworkContext, NetworkTest, SwarmChaos, SwarmNetworkPartition, Test};

/// This is deprecated. Use [`crate::multi_region_network_test::MultiRegionNetworkEmulationTest`] instead.
// The `note` makes the compiler's deprecation warning carry the migration hint.
#[deprecated(note = "use `multi_region_network_test::MultiRegionNetworkEmulationTest` instead")]
pub struct NetworkPartitionTest;

// Partition
Expand Down
7 changes: 6 additions & 1 deletion testsuite/testcases/src/three_region_simulation_test.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
// Copyright © Aptos Foundation
// SPDX-License-Identifier: Apache-2.0

use crate::{LoadDestination, NetworkLoadTest};
use crate::{
multi_region_network_test::MultiRegionNetworkEmulationTest, LoadDestination, NetworkLoadTest,
};
use aptos_forge::{
GroupNetworkBandwidth, GroupNetworkDelay, NetworkContext, NetworkTest, Swarm, SwarmChaos,
SwarmNetworkBandwidth, SwarmNetworkDelay, Test,
Expand All @@ -22,6 +24,9 @@ impl Test for ThreeRegionSameCloudSimulationTest {
/// 2. Each region has minimal network delay amongst its nodes
/// 3. Each region has a network delay to the other two regions, as estimated by https://www.cloudping.co/grid
/// 4. Currently simulating a 50 percentile network delay between us-west <--> af-south <--> eu-north
///
/// This is deprecated and flawed. Use [crate::multi_region_network_test::MultiRegionNetworkEmulationTest] instead
#[deprecated]
fn create_three_region_swarm_network_delay(swarm: &dyn Swarm) -> SwarmNetworkDelay {
let all_validators = swarm.validators().map(|v| v.peer_id()).collect::<Vec<_>>();

Expand Down

0 comments on commit 591ee0c

Please sign in to comment.