[forge] reuse existing management configmap #2132

Merged 1 commit, Jul 21, 2022
6 changes: 6 additions & 0 deletions testsuite/forge-cli/src/main.rs
@@ -91,6 +91,11 @@ struct K8sSwarm {
help = "If set, uses kubectl port-forward instead of assuming k8s DNS access"
)]
port_forward: bool,
#[structopt(
long,
help = "If set, reuse the forge testnet active in the specified namespace"
)]
reuse: bool,
#[structopt(
long,
help = "If set, keeps the forge testnet active in the specified namespace"
@@ -196,6 +201,7 @@ fn main() -> Result<()> {
k8s.image_tag,
k8s.base_image_tag,
k8s.port_forward,
k8s.reuse,
k8s.keep,
k8s.enable_haproxy,
)
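For context, the new flag rides on structopt's boolean-flag behavior: the field defaults to false and flips to true when the flag is passed on the command line. A minimal standalone sketch of that behavior (the struct and binary names here are hypothetical, not from this repo):

```rust
use structopt::StructOpt;

#[derive(StructOpt, Debug)]
struct DemoOpts {
    #[structopt(
        long,
        help = "If set, reuse the forge testnet active in the specified namespace"
    )]
    reuse: bool,
}

fn main() {
    // `demo --reuse` yields DemoOpts { reuse: true }; omitting the flag yields false.
    let opts = DemoOpts::from_args();
    println!("{:?}", opts);
}
```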
2 changes: 1 addition & 1 deletion testsuite/forge-test-runner-template.yaml
@@ -16,7 +16,7 @@ spec:
- -c
- |
ulimit -n 1048576
forge test k8s-swarm --image-tag {IMAGE_TAG} --namespace {FORGE_NAMESPACE} {KEEP_ARGS} {ENABLE_HAPROXY_ARGS}
forge --suite {FORGE_TEST_SUITE} test k8s-swarm --image-tag {IMAGE_TAG} --namespace {FORGE_NAMESPACE} {REUSE_ARGS} {KEEP_ARGS} {ENABLE_HAPROXY_ARGS}
affinity:
# avoid scheduling with other forge or validator/fullnode pods
podAntiAffinity:
76 changes: 51 additions & 25 deletions testsuite/forge/src/backend/k8s/cluster_helper.rs
@@ -19,7 +19,6 @@ use kube::{
client::Client as K8sClient,
Config, Error as KubeError,
};
use rand::Rng;
use serde::de::DeserializeOwned;
use serde_json::Value;
use std::{
@@ -398,10 +397,10 @@ pub async fn install_testnet_resources(
genesis_modules_path: Option<String>,
use_port_forward: bool,
enable_haproxy: bool,
) -> Result<(String, HashMap<PeerId, K8sNode>, HashMap<PeerId, K8sNode>)> {
) -> Result<(HashMap<PeerId, K8sNode>, HashMap<PeerId, K8sNode>)> {
assert!(base_num_validators <= MAX_NUM_VALIDATORS);

let new_era = get_new_era().unwrap();
let new_era = "fg".to_string();
Contributor: nit: Maybe use a better name instead of "fg"? How about "forge_testing"?

let kube_client = create_k8s_client().await;

// get deployment-specific helm values and cache it
@@ -485,6 +484,26 @@
kube_namespace.clone(),
)?;

let (validators, fullnodes) = collect_running_nodes(
&kube_client,
kube_namespace,
base_validator_image_tag,
use_port_forward,
enable_haproxy,
)
.await?;

Ok((validators, fullnodes))
}

/// Collect the running nodes in the network into K8sNodes
pub async fn collect_running_nodes(
kube_client: &K8sClient,
kube_namespace: String,
base_validator_image_tag: String,
use_port_forward: bool,
enable_haproxy: bool,
) -> Result<(HashMap<PeerId, K8sNode>, HashMap<PeerId, K8sNode>)> {
// get all validators
let validators = get_validators(
kube_client.clone(),
@@ -497,31 +516,31 @@
.unwrap();

// wait for all validator STS to spin up
wait_node_stateful_set(&kube_client, &kube_namespace, &validators).await?;
wait_node_stateful_set(kube_client, &kube_namespace, &validators).await?;

if enable_haproxy {
wait_node_haproxy(&kube_client, &kube_namespace, validators.len()).await?;
wait_node_haproxy(kube_client, &kube_namespace, validators.len()).await?;
}

// get all fullnodes
let fullnodes = get_fullnodes(
kube_client.clone(),
&base_validator_image_tag,
&new_era,
&kube_namespace,
use_port_forward,
enable_haproxy,
)
.await
.unwrap();

wait_node_stateful_set(&kube_client, &kube_namespace, &fullnodes).await?;
wait_node_stateful_set(kube_client, &kube_namespace, &fullnodes).await?;

let nodes = validators
.values()
// .chain(fullnodes.values())
.collect::<Vec<&K8sNode>>();

// start port-forward for each of the validators
if use_port_forward {
for node in nodes.iter() {
node.spawn_port_forward()?;
@@ -530,17 +549,7 @@
}

nodes_healthcheck(nodes).await?;

// start port-forward for each of the validators
Ok((new_era, validators, fullnodes))
}

fn get_new_era() -> Result<String> {
// get a random new era to wipe the chain
let mut rng = rand::thread_rng();
let new_era: &str = &format!("fg{}", rng.gen::<u32>());
info!("new chain era: {}", new_era);
Ok(new_era.to_string())
Ok((validators, fullnodes))
}
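Taken together with the fixed era above, this split means a caller can skip installation entirely and discover an already-running testnet. A minimal sketch of that reuse path, using only the helpers whose signatures appear in this diff; the namespace and image tag values are hypothetical, and the surrounding module's imports (K8sClient, K8sNode, PeerId, HashMap, Result, info!) are assumed:

```rust
// Sketch only: assumes the helpers defined in this file.
async fn reuse_existing_testnet(
    kube_namespace: String,
) -> Result<(HashMap<PeerId, K8sNode>, HashMap<PeerId, K8sNode>)> {
    let kube_client = create_k8s_client().await;
    let (validators, fullnodes) = collect_running_nodes(
        &kube_client,
        kube_namespace,           // e.g. "forge-pr-2132" (hypothetical)
        "devnet".to_string(),     // base validator image tag (hypothetical)
        false,                    // use_port_forward
        false,                    // enable_haproxy
    )
    .await?;
    info!(
        "reusing {} validators and {} fullnodes",
        validators.len(),
        fullnodes.len()
    );
    Ok((validators, fullnodes))
}
```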

pub async fn create_k8s_client() -> K8sClient {
@@ -593,7 +602,11 @@ fn get_helm_status(helm_release_name: &str) -> Result<Value> {
.map_err(|e| format_err!("failed to deserialize helm values: {}", e))
}

fn dump_string_to_file(file_name: String, content: String, tmp_dir: &TempDir) -> Result<String> {
pub fn dump_string_to_file(
file_name: String,
content: String,
tmp_dir: &TempDir,
) -> Result<String> {
let file_path = tmp_dir.path().join(file_name.clone());
info!("Wrote content to: {:?}", &file_path);
let mut file = File::create(file_path).expect("Could not create file in temp dir");
@@ -677,12 +690,25 @@ pub async fn create_management_configmap(kube_namespace: String, keep: bool) ->
..ObjectMeta::default()
},
};
configmap.create(&PostParams::default(), &config).await?;

info!(
"Created configmap {} with data {:?}",
management_configmap_name, data
);
if let Err(KubeError::Api(api_err)) = configmap.create(&PostParams::default(), &config).await {
if api_err.code == 409 {
info!(
"Configmap {} already exists, continuing with it",
Contributor: Thank you for fixing this. I was meaning to put out this change but you beat me to it :)

Contributor Author: oops :p well Forge needs lots of improvement -- more opportunities!

&management_configmap_name
);
} else {
bail!(
"Failed to use existing management configmap {}: {:?}",
&kube_namespace,
api_err
);
}
} else {
info!(
"Created configmap {} with data {:?}",
management_configmap_name, data
);
}

Ok(())
}
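The error handling above is the usual kube-rs idempotent-create pattern: attempt the create and treat an HTTP 409 Conflict as "already exists", which is exactly what a --reuse run wants. A stripped-down, self-contained sketch of the same pattern, with hypothetical function and variable names:

```rust
use k8s_openapi::api::core::v1::ConfigMap;
use kube::{
    api::{Api, PostParams},
    Client, Error as KubeError,
};

async fn create_or_reuse_configmap(
    client: Client,
    namespace: &str,
    cm: ConfigMap,
) -> anyhow::Result<()> {
    let api: Api<ConfigMap> = Api::namespaced(client, namespace);
    match api.create(&PostParams::default(), &cm).await {
        Ok(created) => println!("created configmap {:?}", created.metadata.name),
        // 409 Conflict means a previous run already created it; reuse it as-is.
        Err(KubeError::Api(err)) if err.code == 409 => {
            println!("configmap already exists, continuing with it")
        }
        Err(err) => return Err(err.into()),
    }
    Ok(())
}
```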
58 changes: 39 additions & 19 deletions testsuite/forge/src/backend/k8s/mod.rs
@@ -23,6 +23,7 @@ pub struct K8sFactory {
base_image_tag: String,
kube_namespace: String,
use_port_forward: bool,
reuse: bool,
keep: bool,
enable_haproxy: bool,
}
@@ -39,6 +40,7 @@ impl K8sFactory {
image_tag: String,
base_image_tag: String,
use_port_forward: bool,
reuse: bool,
keep: bool,
enable_haproxy: bool,
) -> Result<K8sFactory> {
@@ -66,6 +68,7 @@
base_image_tag,
kube_namespace,
use_port_forward,
reuse,
keep,
enable_haproxy,
})
@@ -100,25 +103,42 @@ impl Factory for K8sFactory {
None => None,
};

// create the forge-management configmap before installing anything
create_management_configmap(self.kube_namespace.clone(), self.keep).await?;

// try installing testnet resources, but clean up if it fails
let (_era, validators, fullnodes) = match install_testnet_resources(
self.kube_namespace.clone(),
node_num.get(),
format!("{}", init_version),
format!("{}", genesis_version),
genesis_modules_path,
self.use_port_forward,
self.enable_haproxy,
)
.await
{
Ok(res) => res,
Err(e) => {
uninstall_testnet_resources(self.kube_namespace.clone()).await?;
bail!(e);
let (validators, fullnodes) = if self.reuse {
let kube_client = create_k8s_client().await;
match collect_running_nodes(
&kube_client,
self.kube_namespace.clone(),
format!("{}", init_version),
self.use_port_forward,
self.enable_haproxy,
)
.await
{
Ok(res) => res,
Err(e) => {
bail!(e);
}
}
} else {
// create the forge-management configmap before installing anything
create_management_configmap(self.kube_namespace.clone(), self.keep).await?;
// try installing testnet resources, but clean up if it fails
match install_testnet_resources(
self.kube_namespace.clone(),
node_num.get(),
format!("{}", init_version),
format!("{}", genesis_version),
genesis_modules_path,
self.use_port_forward,
self.enable_haproxy,
)
.await
{
Ok(res) => res,
Err(e) => {
uninstall_testnet_resources(self.kube_namespace.clone()).await?;
bail!(e);
}
}
};

6 changes: 3 additions & 3 deletions testsuite/forge/src/backend/k8s/swarm.rs
@@ -302,7 +302,6 @@ pub(crate) async fn get_validators(
pub(crate) async fn get_fullnodes(
client: K8sClient,
image_tag: &str,
era: &str,
kube_namespace: &str,
use_port_forward: bool,
enable_haproxy: bool,
@@ -328,8 +327,9 @@
ip = LOCALHOST.to_string();
}
let node_id = parse_node_id(&s.name).expect("error to parse node id");
// the base fullnode name is the same as that of the StatefulSet, and includes era
let fullnode_name = format!("{}-e{}", &s.name, era);
// the base fullnode name is the same as that of the StatefulSet
// TODO: get the era and fullnode group, for now ignore it
let fullnode_name = format!("aptos-node-{}-fullnode", node_id);
let node = K8sNode {
name: fullnode_name.clone(),
sts_name: fullnode_name,
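In other words, the node handle's name is now a pure function of the node id rather than of the randomly generated era. A tiny illustration of the before/after format strings from this hunk; the node id and era values are hypothetical:

```rust
fn main() {
    let node_id = 3;
    let sts_name = format!("aptos-node-{}-fullnode", node_id);
    // Before this PR the era was appended, so each install produced a new name:
    let old_name = format!("{}-e{}", sts_name, "fg123"); // hypothetical era
    // After this PR the name is deterministic, so a --reuse run can find it:
    let new_name = sts_name.clone();
    assert_eq!(old_name, "aptos-node-3-fullnode-efg123");
    assert_eq!(new_name, "aptos-node-3-fullnode");
}
```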
10 changes: 7 additions & 3 deletions testsuite/run_forge.sh
@@ -32,7 +32,9 @@ DEVINFRA_GRAFANA_BASE_URL="https://o11y.aptosdev.com/grafana/d/overview/overview"
# forge test runner customization
FORGE_RUNNER_MODE=${FORGE_RUNNER_MODE:-k8s}
FORGE_NAMESPACE_KEEP=${FORGE_NAMESPACE_KEEP:-false}
FORGE_NAMESPACE_REUSE=${FORGE_NAMESPACE_REUSE:-false}
FORGE_ENABLE_HAPROXY=${FORGE_ENABLE_HAPROXY:-false}
FORGE_TEST_SUITE=${FORGE_TEST_SUITE:-land_blocking}

# if this script is not triggered in GHA, use a default value
[ -z "$GITHUB_RUN_ID" ] && GITHUB_RUN_ID=0
Expand All @@ -48,7 +50,7 @@ FORGE_NAMESPACE="${FORGE_NAMESPACE//[^[:alnum:]]/-}"
# use the first 64 chars only for namespace, as it is the maximum for k8s resources
FORGE_NAMESPACE=${FORGE_NAMESPACE:0:64}


[ "$FORGE_NAMESPACE_REUSE" = "true" ] && REUSE_ARGS="--reuse"
[ "$FORGE_NAMESPACE_KEEP" = "true" ] && KEEP_ARGS="--keep"
[ "$FORGE_ENABLE_HAPROXY" = "true" ] && ENABLE_HAPROXY_ARGS="--enable-haproxy"

Expand Down Expand Up @@ -89,10 +91,10 @@ if [ "$FORGE_RUNNER_MODE" = "local" ]; then
# more file descriptors for heavy txn generation
ulimit -n 1048576

cargo run -p forge-cli -- test k8s-swarm \
cargo run -p forge-cli -- --suite $FORGE_TEST_SUITE test k8s-swarm \
--image-tag $IMAGE_TAG \
--namespace $FORGE_NAMESPACE \
--port-forward $KEEP_ARGS $ENABLE_HAPROXY_ARGS | tee $FORGE_OUTPUT
--port-forward $REUSE_ARGS $KEEP_ARGS $ENABLE_HAPROXY_ARGS | tee $FORGE_OUTPUT

FORGE_EXIT_CODE=$?

@@ -114,10 +116,12 @@ elif [ "$FORGE_RUNNER_MODE" = "k8s" ]; then
echo "Forge test-runner pod Spec : ${specfile}"

sed -e "s/{FORGE_POD_NAME}/${FORGE_POD_NAME}/g" \
-e "s/{FORGE_TEST_SUITE}/${FORGE_TEST_SUITE}/g" \
-e "s/{IMAGE_TAG}/${IMAGE_TAG}/g" \
-e "s/{AWS_ACCOUNT_NUM}/${AWS_ACCOUNT_NUM}/g" \
-e "s/{AWS_REGION}/${AWS_REGION}/g" \
-e "s/{FORGE_NAMESPACE}/${FORGE_NAMESPACE}/g" \
-e "s/{REUSE_ARGS}/${REUSE_ARGS}/g" \
-e "s/{KEEP_ARGS}/${KEEP_ARGS}/g" \
-e "s/{ENABLE_HAPROXY_ARGS}/${ENABLE_HAPROXY_ARGS}/g" \
testsuite/forge-test-runner-template.yaml > ${specfile}
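With these hooks in place, a reuse run can be driven entirely through the environment variables this script already reads; for example, `FORGE_RUNNER_MODE=local FORGE_NAMESPACE_REUSE=true ./testsuite/run_forge.sh` passes `--reuse` through to the forge CLI, which then collects the nodes already running in the namespace instead of reinstalling the testnet.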