Skip to content

Commit

Permalink
[draft] surface counters in forge, to be able to assert them in succe…
Browse files Browse the repository at this point in the history
…ss criteria
  • Loading branch information
igor-aptos committed Aug 1, 2023
1 parent b422401 commit fee3b95
Show file tree
Hide file tree
Showing 12 changed files with 407 additions and 170 deletions.
86 changes: 40 additions & 46 deletions testsuite/forge-cli/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,9 @@ use anyhow::{format_err, Context, Result};
use aptos_config::config::{ChainHealthBackoffValues, ConsensusConfig, PipelineBackpressureValues};
use aptos_forge::{
args::TransactionTypeArg,
success_criteria::{LatencyType, StateProgressThreshold, SuccessCriteria},
success_criteria::{
LatencyBreakdownThreshold, LatencyType, StateProgressThreshold, SuccessCriteria,
},
system_metrics::{MetricsThreshold, SystemMetricsThreshold},
ForgeConfig, Options, *,
};
Expand Down Expand Up @@ -41,7 +43,7 @@ use aptos_testcases::{
validator_reboot_stress_test::ValidatorRebootStressTest,
CompositeNetworkTest,
};
use clap::{Parser, Subcommand};
use clap::{Parser, Subcommand, __derive_refs::once_cell::sync::Lazy};
use futures::stream::{FuturesUnordered, StreamExt};
use rand::{rngs::ThreadRng, seq::SliceRandom, Rng};
use std::{
Expand Down Expand Up @@ -216,6 +218,24 @@ struct Resize {
enable_haproxy: bool,
}

// common metrics thresholds:
static SYSTEM_12_CORES_5GB_THRESHOLD: Lazy<SystemMetricsThreshold> = Lazy::new(|| {
SystemMetricsThreshold::new(
// Check that we don't use more than 12 CPU cores for 30% of the time.
MetricsThreshold::new(12.0, 30),
// Check that we don't use more than 5 GB of memory for 30% of the time.
MetricsThreshold::new_gb(5.0, 30),
)
});
static SYSTEM_12_CORES_10GB_THRESHOLD: Lazy<SystemMetricsThreshold> = Lazy::new(|| {
SystemMetricsThreshold::new(
// Check that we don't use more than 12 CPU cores for 30% of the time.
MetricsThreshold::new(12.0, 30),
// Check that we don't use more than 5 GB of memory for 30% of the time.
MetricsThreshold::new_gb(10.0, 30),
)
});

/// Make an easy to remember random namespace for your testnet
fn random_namespace<R: Rng>(dictionary: Vec<String>, rng: &mut R) -> Result<String> {
// Pick four random words
Expand Down Expand Up @@ -679,12 +699,7 @@ fn twin_validator_test() -> ForgeConfig {
SuccessCriteria::new(5500)
.add_no_restarts()
.add_wait_for_catchup_s(60)
.add_system_metrics_threshold(SystemMetricsThreshold::new(
// Check that we don't use more than 12 CPU cores for 30% of the time.
MetricsThreshold::new(12, 30),
// Check that we don't use more than 5 GB of memory for 30% of the time.
MetricsThreshold::new(5 * 1024 * 1024 * 1024, 30),
))
.add_system_metrics_threshold(SYSTEM_12_CORES_5GB_THRESHOLD.clone())
.add_chain_progress(StateProgressThreshold {
max_no_progress_secs: 10.0,
max_round_gap: 4,
Expand Down Expand Up @@ -958,9 +973,9 @@ fn graceful_overload() -> ForgeConfig {
.add_wait_for_catchup_s(120)
.add_system_metrics_threshold(SystemMetricsThreshold::new(
// Check that we don't use more than 12 CPU cores for 30% of the time.
MetricsThreshold::new(12, 40),
MetricsThreshold::new(12.0, 40),
// Check that we don't use more than 5 GB of memory for 30% of the time.
MetricsThreshold::new(5 * 1024 * 1024 * 1024, 30),
MetricsThreshold::new_gb(5.0, 30),
))
.add_latency_threshold(10.0, LatencyType::P50)
.add_latency_threshold(30.0, LatencyType::P90)
Expand Down Expand Up @@ -1009,9 +1024,9 @@ fn realistic_env_graceful_overload() -> ForgeConfig {
.add_system_metrics_threshold(SystemMetricsThreshold::new(
// overload test uses more CPUs than others, so increase the limit
// Check that we don't use more than 18 CPU cores for 30% of the time.
MetricsThreshold::new(18, 40),
MetricsThreshold::new(18.0, 40),
// Check that we don't use more than 5 GB of memory for 30% of the time.
MetricsThreshold::new(5 * 1024 * 1024 * 1024, 30),
MetricsThreshold::new_gb(5.0, 30),
))
.add_latency_threshold(10.0, LatencyType::P50)
.add_latency_threshold(30.0, LatencyType::P90)
Expand Down Expand Up @@ -1417,12 +1432,7 @@ fn validators_join_and_leave() -> ForgeConfig {
SuccessCriteria::new(5000)
.add_no_restarts()
.add_wait_for_catchup_s(240)
.add_system_metrics_threshold(SystemMetricsThreshold::new(
// Check that we don't use more than 12 CPU cores for 30% of the time.
MetricsThreshold::new(12, 30),
// Check that we don't use more than 10 GB of memory for 30% of the time.
MetricsThreshold::new(10 * 1024 * 1024 * 1024, 30),
))
.add_system_metrics_threshold(SYSTEM_12_CORES_10GB_THRESHOLD.clone())
.add_chain_progress(StateProgressThreshold {
max_no_progress_secs: 10.0,
max_round_gap: 4,
Expand Down Expand Up @@ -1452,12 +1462,7 @@ fn land_blocking_test_suite(duration: Duration) -> ForgeConfig {
// Give at least 60s for catchup, give 10% of the run for longer durations.
(duration.as_secs() / 10).max(60),
)
.add_system_metrics_threshold(SystemMetricsThreshold::new(
// Check that we don't use more than 12 CPU cores for 30% of the time.
MetricsThreshold::new(12, 30),
// Check that we don't use more than 10 GB of memory for 30% of the time.
MetricsThreshold::new(10 * 1024 * 1024 * 1024, 30),
))
.add_system_metrics_threshold(SYSTEM_12_CORES_10GB_THRESHOLD.clone())
.add_chain_progress(StateProgressThreshold {
max_no_progress_secs: 10.0,
max_round_gap: 4,
Expand Down Expand Up @@ -1519,12 +1524,16 @@ fn realistic_env_max_load_test(
)
.add_system_metrics_threshold(SystemMetricsThreshold::new(
// Check that we don't use more than 12 CPU cores for 30% of the time.
MetricsThreshold::new(14, max_cpu_threshold),
MetricsThreshold::new(14.0, max_cpu_threshold),
// Check that we don't use more than 10 GB of memory for 30% of the time.
MetricsThreshold::new(10 * 1024 * 1024 * 1024, 30),
MetricsThreshold::new_gb(10.0, 30),
))
.add_latency_threshold(3.0, LatencyType::P50)
.add_latency_threshold(5.0, LatencyType::P90)
.add_latency_breakdown_threshold(LatencyBreakdownThreshold {
block_create_to_ordered_threshold: MetricsThreshold::new(1.5, 0),
block_create_to_commit_threshold: MetricsThreshold::new(2.5, 0),
})
.add_chain_progress(StateProgressThreshold {
max_no_progress_secs: 10.0,
max_round_gap: 4,
Expand Down Expand Up @@ -1580,9 +1589,9 @@ fn realistic_network_tuned_for_throughput_test() -> ForgeConfig {
// Tuned for throughput uses more cores than regular tests,
// as it achieves higher throughput.
// Check that we don't use more than 14 CPU cores for 30% of the time.
MetricsThreshold::new(14, 30),
MetricsThreshold::new(14.0, 30),
// Check that we don't use more than 10 GB of memory for 30% of the time.
MetricsThreshold::new(10 * 1024 * 1024 * 1024, 30),
MetricsThreshold::new_gb(10.0, 30),
))
.add_chain_progress(StateProgressThreshold {
max_no_progress_secs: 10.0,
Expand Down Expand Up @@ -1612,12 +1621,7 @@ fn chaos_test_suite(duration: Duration) -> ForgeConfig {
},
)
.add_no_restarts()
.add_system_metrics_threshold(SystemMetricsThreshold::new(
// Check that we don't use more than 12 CPU cores for 30% of the time.
MetricsThreshold::new(12, 30),
// Check that we don't use more than 5 GB of memory for 30% of the time.
MetricsThreshold::new(5 * 1024 * 1024 * 1024, 30),
)),
.add_system_metrics_threshold(SYSTEM_12_CORES_5GB_THRESHOLD.clone()),
)
}

Expand Down Expand Up @@ -1791,12 +1795,7 @@ fn quorum_store_reconfig_enable_test() -> ForgeConfig {
SuccessCriteria::new(5000)
.add_no_restarts()
.add_wait_for_catchup_s(240)
.add_system_metrics_threshold(SystemMetricsThreshold::new(
// Check that we don't use more than 12 CPU cores for 30% of the time.
MetricsThreshold::new(12, 30),
// Check that we don't use more than 10 GB of memory for 30% of the time.
MetricsThreshold::new(10 * 1024 * 1024 * 1024, 30),
))
.add_system_metrics_threshold(SYSTEM_12_CORES_10GB_THRESHOLD.clone())
.add_chain_progress(StateProgressThreshold {
max_no_progress_secs: 10.0,
max_round_gap: 4,
Expand Down Expand Up @@ -1863,12 +1862,7 @@ fn multiregion_benchmark_test() -> ForgeConfig {
// Give at least 60s for catchup, give 10% of the run for longer durations.
180,
)
.add_system_metrics_threshold(SystemMetricsThreshold::new(
// Check that we don't use more than 12 CPU cores for 30% of the time.
MetricsThreshold::new(12, 30),
// Check that we don't use more than 10 GB of memory for 30% of the time.
MetricsThreshold::new(10 * 1024 * 1024 * 1024, 30),
))
.add_system_metrics_threshold(SYSTEM_12_CORES_10GB_THRESHOLD.clone())
.add_chain_progress(StateProgressThreshold {
max_no_progress_secs: 10.0,
max_round_gap: 4,
Expand Down
59 changes: 48 additions & 11 deletions testsuite/forge/src/backend/k8s/prometheus.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,23 @@
// SPDX-License-Identifier: Apache-2.0

use crate::{create_k8s_client, K8sApi, ReadWrite, Result};
use anyhow::bail;
use again::RetryPolicy;
use anyhow::{anyhow, bail};
use aptos_logger::info;
use k8s_openapi::api::core::v1::Secret;
use prometheus_http_query::{response::PromqlResult, Client as PrometheusClient};
use once_cell::sync::Lazy;
use prometheus_http_query::{
response::{PromqlResult, Sample},
Client as PrometheusClient,
};
use reqwest::{header, Client as HttpClient};
use std::{collections::BTreeMap, sync::Arc};
use std::{collections::BTreeMap, sync::Arc, time::Duration};

static PROMETHEUS_RETRY_POLICY: Lazy<RetryPolicy> = Lazy::new(|| {
RetryPolicy::exponential(Duration::from_millis(125))
.with_max_retries(3)
.with_jitter(true)
});

pub async fn get_prometheus_client() -> Result<PrometheusClient> {
// read from the environment
Expand Down Expand Up @@ -91,7 +102,7 @@ async fn create_prometheus_client_from_environment(

pub fn construct_query_with_extra_labels(
query: &str,
labels_map: BTreeMap<String, String>,
labels_map: &BTreeMap<String, String>,
) -> String {
// edit the query string to insert swarm metadata
let mut new_query = query.to_string();
Expand Down Expand Up @@ -123,13 +134,39 @@ pub async fn query_with_metadata(
query: &str,
time: Option<i64>,
timeout: Option<i64>,
labels_map: BTreeMap<String, String>,
labels_map: &BTreeMap<String, String>,
) -> Result<PromqlResult> {
let new_query = construct_query_with_extra_labels(query, labels_map);
match prom_client.query(&new_query, time, timeout).await {
Ok(r) => Ok(r),
Err(e) => bail!(e),
}
let new_query_ref = &new_query;
PROMETHEUS_RETRY_POLICY
.retry(move || prom_client.query(new_query_ref, time, timeout))
.await
.map_err(|e| anyhow!("Failed to query prometheus for {}: {}", query, e))
}

pub async fn query_range_with_metadata(
prom_client: &PrometheusClient,
query: &str,
start_time: i64,
end_time: i64,
internal_secs: f64,
timeout: Option<i64>,
labels_map: &BTreeMap<String, String>,
) -> Result<Vec<Sample>> {
let new_query = construct_query_with_extra_labels(query, labels_map);
let new_query_ref = &new_query;
let r = PROMETHEUS_RETRY_POLICY
.retry(move || {
prom_client.query_range(new_query_ref, start_time, end_time, internal_secs, timeout)
})
.await
.map_err(|e| anyhow!("Failed to query prometheus for {}: {}", query, e))?;
Ok(r.as_range()
.ok_or_else(|| anyhow!("Failed to get range from prometheus response"))?
.first()
.ok_or_else(|| anyhow!("Empty range vector returned from prometheus"))?
.samples()
.to_vec())
}

#[cfg(test)]
Expand Down Expand Up @@ -259,7 +296,7 @@ mod tests {
labels_map.insert("a".to_string(), "a".to_string());
labels_map.insert("some_label".to_string(), "blabla".to_string());
let expected_query = r#"aptos_connections{a="a",some_label="blabla"}"#;
let new_query = construct_query_with_extra_labels(original_query, labels_map);
let new_query = construct_query_with_extra_labels(original_query, &labels_map);
assert_eq!(expected_query, new_query);

// test when existing labels
Expand All @@ -268,7 +305,7 @@ mod tests {
labels_map.insert("a".to_string(), "a".to_string());
labels_map.insert("some_label".to_string(), "blabla".to_string());
let expected_query = r#"aptos_connections{a="a",some_label="blabla",abc="123",def="456"}"#;
let new_query = construct_query_with_extra_labels(original_query, labels_map);
let new_query = construct_query_with_extra_labels(original_query, &labels_map);
assert_eq!(expected_query, new_query);
}
}
33 changes: 30 additions & 3 deletions testsuite/forge/src/backend/k8s/swarm.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ use crate::{
get_free_port, get_stateful_set_image, install_public_fullnode,
interface::system_metrics::{query_prometheus_system_metrics, SystemMetricsThreshold},
node::K8sNode,
prometheus::{self, query_with_metadata},
prometheus::{self, query_range_with_metadata, query_with_metadata},
query_sequence_number, set_stateful_set_image_tag, uninstall_testnet_resources, ChainInfo,
FullNode, K8sApi, Node, Result, Swarm, SwarmChaos, Validator, Version, HAPROXY_SERVICE_SUFFIX,
REST_API_HAPROXY_SERVICE_PORT, REST_API_SERVICE_PORT,
Expand All @@ -29,7 +29,10 @@ use kube::{
api::{Api, ListParams},
client::Client as K8sClient,
};
use prometheus_http_query::{response::PromqlResult, Client as PrometheusClient};
use prometheus_http_query::{
response::{PromqlResult, Sample},
Client as PrometheusClient,
};
use regex::Regex;
use std::{
collections::{BTreeMap, HashMap, HashSet},
Expand Down Expand Up @@ -402,7 +405,31 @@ impl Swarm for K8sSwarm {
if let Some(c) = &self.prom_client {
let mut labels_map = BTreeMap::new();
labels_map.insert("namespace".to_string(), self.kube_namespace.clone());
return query_with_metadata(c, query, time, timeout, labels_map).await;
return query_with_metadata(c, query, time, timeout, &labels_map).await;
}
bail!("No prom client");
}

async fn query_range_metrics(
&self,
query: &str,
start_time: i64,
end_time: i64,
timeout: Option<i64>,
) -> Result<Vec<Sample>> {
if let Some(c) = &self.prom_client {
let mut labels_map = BTreeMap::new();
labels_map.insert("namespace".to_string(), self.kube_namespace.clone());
return query_range_with_metadata(
c,
query,
start_time,
end_time,
30.0,
timeout,
&labels_map,
)
.await;
}
bail!("No prom client");
}
Expand Down
12 changes: 11 additions & 1 deletion testsuite/forge/src/backend/local/swarm.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ use aptos_sdk::{
PeerId,
},
};
use prometheus_http_query::response::PromqlResult;
use prometheus_http_query::response::{PromqlResult, Sample};
use std::{
collections::HashMap,
fs,
Expand Down Expand Up @@ -628,6 +628,16 @@ impl Swarm for LocalSwarm {
todo!()
}

async fn query_range_metrics(
&self,
_query: &str,
_start_time: i64,
_end_time: i64,
_timeout: Option<i64>,
) -> Result<Vec<Sample>> {
todo!()
}

async fn ensure_healthy_system_metrics(
&mut self,
_start_time: i64,
Expand Down
4 changes: 3 additions & 1 deletion testsuite/forge/src/interface/network.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

use super::Test;
use crate::{
success_criteria::{SuccessCriteria, SuccessCriteriaChecker},
success_criteria::{LatencyBreakdown, SuccessCriteria, SuccessCriteriaChecker},
CoreContext, Result, Swarm, TestReport,
};
use aptos_transaction_emitter_lib::{EmitJobRequest, TxnStats};
Expand Down Expand Up @@ -61,6 +61,7 @@ impl<'t> NetworkContext<'t> {
&mut self,
stats: &TxnStats,
window: Duration,
latency_breakdown: &LatencyBreakdown,
start_time: i64,
end_time: i64,
start_version: u64,
Expand All @@ -73,6 +74,7 @@ impl<'t> NetworkContext<'t> {
self.report,
stats,
window,
latency_breakdown,
start_time,
end_time,
start_version,
Expand Down
Loading

0 comments on commit fee3b95

Please sign in to comment.