diff --git a/testsuite/forge-cli/src/main.rs b/testsuite/forge-cli/src/main.rs
index 00c4834652020..ebf5072e328d3 100644
--- a/testsuite/forge-cli/src/main.rs
+++ b/testsuite/forge-cli/src/main.rs
@@ -6,6 +6,7 @@ use anyhow::{format_err, Context, Result};
 use aptos_config::config::{ChainHealthBackoffValues, ConsensusConfig, PipelineBackpressureValues};
 use aptos_forge::{
     args::TransactionTypeArg,
+    prometheus_metrics::LatencyBreakdownSlice,
     success_criteria::{
         LatencyBreakdownThreshold, LatencyType, MetricsThreshold, StateProgressThreshold,
         SuccessCriteria, SystemMetricsThreshold,
@@ -231,7 +232,7 @@ static SYSTEM_12_CORES_10GB_THRESHOLD: Lazy<SystemMetricsThreshold> = Lazy::new(
     SystemMetricsThreshold::new(
         // Check that we don't use more than 12 CPU cores for 30% of the time.
         MetricsThreshold::new(12.0, 30),
-        // Check that we don't use more than 5 GB of memory for 30% of the time.
+        // Check that we don't use more than 10 GB of memory for 30% of the time.
         MetricsThreshold::new_gb(10.0, 30),
     )
 });
@@ -1523,16 +1524,19 @@ fn realistic_env_max_load_test(
                     (duration.as_secs() / 10).max(60),
                 )
                 .add_system_metrics_threshold(SystemMetricsThreshold::new(
-                    // Check that we don't use more than 12 CPU cores for 30% of the time.
+                    // Check that we don't use more than 14 CPU cores for 30% of the time.
                     MetricsThreshold::new(14.0, max_cpu_threshold),
                     // Check that we don't use more than 10 GB of memory for 30% of the time.
                     MetricsThreshold::new_gb(10.0, 30),
                 ))
-                .add_latency_threshold(3.0, LatencyType::P50)
-                .add_latency_threshold(5.0, LatencyType::P90)
-                .add_latency_breakdown_threshold(LatencyBreakdownThreshold::new_strict(
-                    0.3, 0.25, 0.8, 0.6,
-                ))
+                .add_latency_threshold(3.4, LatencyType::P50)
+                .add_latency_threshold(4.5, LatencyType::P90)
+                .add_latency_breakdown_threshold(LatencyBreakdownThreshold::new_strict(vec![
+                    (LatencyBreakdownSlice::QsBatchToPos, 0.3),
+                    (LatencyBreakdownSlice::QsPosToProposal, 0.25),
+                    (LatencyBreakdownSlice::ConsensusProposalToOrdered, 0.8),
+                    (LatencyBreakdownSlice::ConsensusOrderedToCommit, 0.6),
+                ]))
                 .add_chain_progress(StateProgressThreshold {
                     max_no_progress_secs: 10.0,
                     max_round_gap: 4,
diff --git a/testsuite/forge/src/backend/k8s/prometheus.rs b/testsuite/forge/src/backend/k8s/prometheus.rs
index f59f6810888d7..d0d28b7f729df 100644
--- a/testsuite/forge/src/backend/k8s/prometheus.rs
+++ b/testsuite/forge/src/backend/k8s/prometheus.rs
@@ -105,28 +105,36 @@ pub fn construct_query_with_extra_labels(
     labels_map: &BTreeMap<String, String>,
 ) -> String {
     // edit the query string to insert swarm metadata
-    let mut new_query = query.to_string();
-    let mut label_start_idx = query.find('{').unwrap_or(query.len());
-    if label_start_idx == query.len() {
-        // add a new curly and insert after it
-        new_query.insert_str(query.len(), "{}");
-        label_start_idx += 1;
-    } else {
-        // add a comma prefix to the existing labels and insert before it
-        label_start_idx += 1;
-        new_query.insert(label_start_idx, ',');
-    }
+    let mut new_query = "".to_string();
+
     let mut labels_strs = vec![];
     for (k, v) in labels_map {
         labels_strs.push(format!(r#"{}="{}""#, k, v));
     }
     let labels = labels_strs.join(",");
-    // assume no collisions in Forge namespace
-    new_query.insert_str(label_start_idx, &labels);
-    new_query
+
+    let parts: Vec<&str> = query.split_inclusive('{').collect();
+    if parts.len() == 1 {
+        // no labels in query
+        format!("{}{{{}}}", query, labels)
+    } else {
+        let mut parts_iter = parts.into_iter();
+        let prev = parts_iter.next();
+        new_query.push_str(prev.unwrap());
+
+        for part in parts_iter {
+            if part.starts_with('}') {
+                // assume no collisions in Forge namespace
+                new_query.push_str(&labels);
+            } else {
+                // assume no collisions in Forge namespace
+                new_query.push_str(&labels);
+                new_query.push(',');
+            }
+            new_query.push_str(part);
+        }
+        new_query
+    }
 }
 
 pub async fn query_with_metadata(
@@ -169,16 +177,14 @@ pub async fn query_range_with_metadata(
                 new_query
             )
         })?;
-    let range = r.as_range()
-        .ok_or_else(|| {
-            anyhow!(
-                "Failed to get range from prometheus response. start={}, end={}, query={}",
-                start_time,
-                end_time,
-                new_query
-            )
-        })?;
-    info!("For Query {} got range {:?}", new_query, range);
+    let range = r.as_range().ok_or_else(|| {
+        anyhow!(
+            "Failed to get range from prometheus response. start={}, end={}, query={}",
+            start_time,
+            end_time,
+            new_query
+        )
+    })?;
     if range.len() != 1 {
         bail!(
             "Expected only one range vector from prometheus, recieved {} ({:?}). start={}, end={}, query={}",
@@ -191,14 +197,7 @@
     }
     Ok(range
         .first()
-        .ok_or_else(|| {
-            anyhow!(
-                "Empty range vector returned from prometheus. start={}, end={}, query={}",
-                start_time,
-                end_time,
-                new_query
-            )
-        })?
+        .unwrap() // safe because we checked length above
         .samples()
         .to_vec())
 }
@@ -324,22 +323,32 @@ mod tests {
 
     #[test]
     fn test_create_query() {
-        // test when no existing labels
-        let original_query = "aptos_connections";
         let mut labels_map = BTreeMap::new();
         labels_map.insert("a".to_string(), "a".to_string());
         labels_map.insert("some_label".to_string(), "blabla".to_string());
+
+        // test when no existing labels
+        let original_query = "aptos_connections";
+        let expected_query = r#"aptos_connections{a="a",some_label="blabla"}"#;
+        let new_query = construct_query_with_extra_labels(original_query, &labels_map);
+        assert_eq!(expected_query, new_query);
+
+        // test when empty labels
+        let original_query = "aptos_connections{}";
         let expected_query = r#"aptos_connections{a="a",some_label="blabla"}"#;
         let new_query = construct_query_with_extra_labels(original_query, &labels_map);
         assert_eq!(expected_query, new_query);
 
         // test when existing labels
         let original_query = r#"aptos_connections{abc="123",def="456"}"#;
-        let mut labels_map = BTreeMap::new();
-        labels_map.insert("a".to_string(), "a".to_string());
-        labels_map.insert("some_label".to_string(), "blabla".to_string());
         let expected_query = r#"aptos_connections{a="a",some_label="blabla",abc="123",def="456"}"#;
         let new_query = construct_query_with_extra_labels(original_query, &labels_map);
         assert_eq!(expected_query, new_query);
+
+        // test when multiple queries
+        let original_query = r#"aptos_connections{abc="123",def="456"} - aptos_disconnects{abc="123"} / aptos_count{}"#;
+        let expected_query = r#"aptos_connections{a="a",some_label="blabla",abc="123",def="456"} - aptos_disconnects{a="a",some_label="blabla",abc="123"} / aptos_count{a="a",some_label="blabla"}"#;
+        let new_query = construct_query_with_extra_labels(original_query, &labels_map);
+        assert_eq!(expected_query, new_query);
     }
 }
diff --git a/testsuite/forge/src/success_criteria.rs b/testsuite/forge/src/success_criteria.rs
index a6ab3a782dd0a..5856ee21c905d 100644
--- a/testsuite/forge/src/success_criteria.rs
+++ b/testsuite/forge/src/success_criteria.rs
@@ -105,30 +105,13 @@ pub struct LatencyBreakdownThreshold {
 }
 
 impl LatencyBreakdownThreshold {
-    pub fn new_strict(
-        qs_batch_to_pos_threshold: f64,
-        qs_pos_to_proposal_threshold: f64,
-        consensus_proposal_to_ordered_threshold: f64,
-        consensus_ordered_to_commit_threshold: f64,
-    ) -> Self {
-        let mut thresholds = BTreeMap::new();
-        thresholds.insert(
-            LatencyBreakdownSlice::QsBatchToPos,
-            MetricsThreshold::new(qs_batch_to_pos_threshold, 0),
-        );
-        thresholds.insert(
-            LatencyBreakdownSlice::QsPosToProposal,
-            MetricsThreshold::new(qs_pos_to_proposal_threshold, 0),
-        );
-        thresholds.insert(
-            LatencyBreakdownSlice::ConsensusProposalToOrdered,
-            MetricsThreshold::new(consensus_proposal_to_ordered_threshold, 0),
-        );
-        thresholds.insert(
-            LatencyBreakdownSlice::ConsensusOrderedToCommit,
-            MetricsThreshold::new(consensus_ordered_to_commit_threshold, 0),
-        );
-        Self { thresholds }
+    pub fn new_strict(thresholds: Vec<(LatencyBreakdownSlice, f64)>) -> Self {
+        Self {
+            thresholds: thresholds
+                .into_iter()
+                .map(|(k, v)| (k, MetricsThreshold::new(v, 0)))
+                .collect(),
+        }
     }
 
     pub fn ensure_threshold(&self, metrics: &LatencyBreakdown) -> anyhow::Result<()> {
diff --git a/testsuite/testcases/src/lib.rs b/testsuite/testcases/src/lib.rs
index fbdf6a9a58ef6..4a6b296795634 100644
--- a/testsuite/testcases/src/lib.rs
+++ b/testsuite/testcases/src/lib.rs
@@ -339,7 +339,7 @@ impl dyn NetworkLoadTest {
                 phase_timing[i].end_unixtime_s,
             ))?;
             info!(
-                "latency_breakdown: from {} to {}: {:?}",
+                "Latency breakdown: from {} to {}: {:?}",
                 phase_timing[i].start_unixtime_s, phase_timing[i].end_unixtime_s, latency_breakdown
             );
             stats_by_phase_filtered.push(LoadTestPhaseStats {
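
Note (not part of the patch): the rewritten `construct_query_with_extra_labels` relies on `str::split_inclusive`, which keeps the `{` delimiter at the end of each piece, so every piece after the first begins either with `}` (an empty selector, labels inserted as-is) or with an existing label list (labels inserted with a trailing comma). A minimal sketch of that behavior, using a made-up query string:

```rust
fn main() {
    // `split_inclusive('{')` keeps the '{' at the end of each piece, so every piece
    // after the first starts either with '}' or with the first existing label.
    let query = r#"aptos_connections{abc="123"} / aptos_count{}"#;
    let parts: Vec<&str> = query.split_inclusive('{').collect();
    assert_eq!(parts, vec![
        "aptos_connections{",
        r#"abc="123"} / aptos_count{"#,
        "}",
    ]);
}
```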
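Note (not part of the patch): because `new_strict` now takes a `Vec<(LatencyBreakdownSlice, f64)>`, a suite can set thresholds for only the slices it cares about instead of always supplying all four values. A hypothetical caller (the function name below is invented for illustration) might look like:

```rust
use aptos_forge::{
    prometheus_metrics::LatencyBreakdownSlice,
    success_criteria::LatencyBreakdownThreshold,
};

// Hypothetical helper: strict thresholds for just the quorum store slices,
// leaving the consensus slices unchecked.
fn quorum_store_only_thresholds() -> LatencyBreakdownThreshold {
    LatencyBreakdownThreshold::new_strict(vec![
        (LatencyBreakdownSlice::QsBatchToPos, 0.3),
        (LatencyBreakdownSlice::QsPosToProposal, 0.25),
    ])
}
```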