[draft] surface counters in forge, to be able to assert them in succe…

…ss criteria
aptos-labs · Aug 1, 2023 · fee3b95 · fee3b95
1 parent b422401
commit fee3b95
Show file tree

Hide file tree

Showing 12 changed files with 407 additions and 170 deletions.
diff --git a/testsuite/forge-cli/src/main.rs b/testsuite/forge-cli/src/main.rs
@@ -6,7 +6,9 @@ use anyhow::{format_err, Context, Result};
 use aptos_config::config::{ChainHealthBackoffValues, ConsensusConfig, PipelineBackpressureValues};
 use aptos_forge::{
     args::TransactionTypeArg,
-    success_criteria::{LatencyType, StateProgressThreshold, SuccessCriteria},
+    success_criteria::{
+        LatencyBreakdownThreshold, LatencyType, StateProgressThreshold, SuccessCriteria,
+    },
     system_metrics::{MetricsThreshold, SystemMetricsThreshold},
     ForgeConfig, Options, *,
 };
@@ -41,7 +43,7 @@ use aptos_testcases::{
     validator_reboot_stress_test::ValidatorRebootStressTest,
     CompositeNetworkTest,
 };
-use clap::{Parser, Subcommand};
+use clap::{Parser, Subcommand, __derive_refs::once_cell::sync::Lazy};
 use futures::stream::{FuturesUnordered, StreamExt};
 use rand::{rngs::ThreadRng, seq::SliceRandom, Rng};
 use std::{
@@ -216,6 +218,24 @@ struct Resize {
     enable_haproxy: bool,
 }
 
+// common metrics thresholds:
+static SYSTEM_12_CORES_5GB_THRESHOLD: Lazy<SystemMetricsThreshold> = Lazy::new(|| {
+    SystemMetricsThreshold::new(
+        // Check that we don't use more than 12 CPU cores for 30% of the time.
+        MetricsThreshold::new(12.0, 30),
+        // Check that we don't use more than 5 GB of memory for 30% of the time.
+        MetricsThreshold::new_gb(5.0, 30),
+    )
+});
+static SYSTEM_12_CORES_10GB_THRESHOLD: Lazy<SystemMetricsThreshold> = Lazy::new(|| {
+    SystemMetricsThreshold::new(
+        // Check that we don't use more than 12 CPU cores for 30% of the time.
+        MetricsThreshold::new(12.0, 30),
+        // Check that we don't use more than 5 GB of memory for 30% of the time.
+        MetricsThreshold::new_gb(10.0, 30),
+    )
+});
+
 /// Make an easy to remember random namespace for your testnet
 fn random_namespace<R: Rng>(dictionary: Vec<String>, rng: &mut R) -> Result<String> {
     // Pick four random words
@@ -679,12 +699,7 @@ fn twin_validator_test() -> ForgeConfig {
             SuccessCriteria::new(5500)
                 .add_no_restarts()
                 .add_wait_for_catchup_s(60)
-                .add_system_metrics_threshold(SystemMetricsThreshold::new(
-                    // Check that we don't use more than 12 CPU cores for 30% of the time.
-                    MetricsThreshold::new(12, 30),
-                    // Check that we don't use more than 5 GB of memory for 30% of the time.
-                    MetricsThreshold::new(5 * 1024 * 1024 * 1024, 30),
-                ))
+                .add_system_metrics_threshold(SYSTEM_12_CORES_5GB_THRESHOLD.clone())
                 .add_chain_progress(StateProgressThreshold {
                     max_no_progress_secs: 10.0,
                     max_round_gap: 4,
@@ -958,9 +973,9 @@ fn graceful_overload() -> ForgeConfig {
                 .add_wait_for_catchup_s(120)
                 .add_system_metrics_threshold(SystemMetricsThreshold::new(
                     // Check that we don't use more than 12 CPU cores for 30% of the time.
-                    MetricsThreshold::new(12, 40),
+                    MetricsThreshold::new(12.0, 40),
                     // Check that we don't use more than 5 GB of memory for 30% of the time.
-                    MetricsThreshold::new(5 * 1024 * 1024 * 1024, 30),
+                    MetricsThreshold::new_gb(5.0, 30),
                 ))
                 .add_latency_threshold(10.0, LatencyType::P50)
                 .add_latency_threshold(30.0, LatencyType::P90)
@@ -1009,9 +1024,9 @@ fn realistic_env_graceful_overload() -> ForgeConfig {
                 .add_system_metrics_threshold(SystemMetricsThreshold::new(
                     // overload test uses more CPUs than others, so increase the limit
                     // Check that we don't use more than 18 CPU cores for 30% of the time.
-                    MetricsThreshold::new(18, 40),
+                    MetricsThreshold::new(18.0, 40),
                     // Check that we don't use more than 5 GB of memory for 30% of the time.
-                    MetricsThreshold::new(5 * 1024 * 1024 * 1024, 30),
+                    MetricsThreshold::new_gb(5.0, 30),
                 ))
                 .add_latency_threshold(10.0, LatencyType::P50)
                 .add_latency_threshold(30.0, LatencyType::P90)
@@ -1417,12 +1432,7 @@ fn validators_join_and_leave() -> ForgeConfig {
             SuccessCriteria::new(5000)
                 .add_no_restarts()
                 .add_wait_for_catchup_s(240)
-                .add_system_metrics_threshold(SystemMetricsThreshold::new(
-                    // Check that we don't use more than 12 CPU cores for 30% of the time.
-                    MetricsThreshold::new(12, 30),
-                    // Check that we don't use more than 10 GB of memory for 30% of the time.
-                    MetricsThreshold::new(10 * 1024 * 1024 * 1024, 30),
-                ))
+                .add_system_metrics_threshold(SYSTEM_12_CORES_10GB_THRESHOLD.clone())
                 .add_chain_progress(StateProgressThreshold {
                     max_no_progress_secs: 10.0,
                     max_round_gap: 4,
@@ -1452,12 +1462,7 @@ fn land_blocking_test_suite(duration: Duration) -> ForgeConfig {
                 // Give at least 60s for catchup, give 10% of the run for longer durations.
                 (duration.as_secs() / 10).max(60),
             )
-            .add_system_metrics_threshold(SystemMetricsThreshold::new(
-                // Check that we don't use more than 12 CPU cores for 30% of the time.
-                MetricsThreshold::new(12, 30),
-                // Check that we don't use more than 10 GB of memory for 30% of the time.
-                MetricsThreshold::new(10 * 1024 * 1024 * 1024, 30),
-            ))
+            .add_system_metrics_threshold(SYSTEM_12_CORES_10GB_THRESHOLD.clone())
             .add_chain_progress(StateProgressThreshold {
                 max_no_progress_secs: 10.0,
                 max_round_gap: 4,
@@ -1519,12 +1524,16 @@ fn realistic_env_max_load_test(
                 )
                 .add_system_metrics_threshold(SystemMetricsThreshold::new(
                     // Check that we don't use more than 12 CPU cores for 30% of the time.
-                    MetricsThreshold::new(14, max_cpu_threshold),
+                    MetricsThreshold::new(14.0, max_cpu_threshold),
                     // Check that we don't use more than 10 GB of memory for 30% of the time.
-                    MetricsThreshold::new(10 * 1024 * 1024 * 1024, 30),
+                    MetricsThreshold::new_gb(10.0, 30),
                 ))
                 .add_latency_threshold(3.0, LatencyType::P50)
                 .add_latency_threshold(5.0, LatencyType::P90)
+                .add_latency_breakdown_threshold(LatencyBreakdownThreshold {
+                    block_create_to_ordered_threshold: MetricsThreshold::new(1.5, 0),
+                    block_create_to_commit_threshold: MetricsThreshold::new(2.5, 0),
+                })
                 .add_chain_progress(StateProgressThreshold {
                     max_no_progress_secs: 10.0,
                     max_round_gap: 4,
@@ -1580,9 +1589,9 @@ fn realistic_network_tuned_for_throughput_test() -> ForgeConfig {
                     // Tuned for throughput uses more cores than regular tests,
                     // as it achieves higher throughput.
                     // Check that we don't use more than 14 CPU cores for 30% of the time.
-                    MetricsThreshold::new(14, 30),
+                    MetricsThreshold::new(14.0, 30),
                     // Check that we don't use more than 10 GB of memory for 30% of the time.
-                    MetricsThreshold::new(10 * 1024 * 1024 * 1024, 30),
+                    MetricsThreshold::new_gb(10.0, 30),
                 ))
                 .add_chain_progress(StateProgressThreshold {
                     max_no_progress_secs: 10.0,
@@ -1612,12 +1621,7 @@ fn chaos_test_suite(duration: Duration) -> ForgeConfig {
                 },
             )
             .add_no_restarts()
-            .add_system_metrics_threshold(SystemMetricsThreshold::new(
-                // Check that we don't use more than 12 CPU cores for 30% of the time.
-                MetricsThreshold::new(12, 30),
-                // Check that we don't use more than 5 GB of memory for 30% of the time.
-                MetricsThreshold::new(5 * 1024 * 1024 * 1024, 30),
-            )),
+            .add_system_metrics_threshold(SYSTEM_12_CORES_5GB_THRESHOLD.clone()),
         )
 }
 
@@ -1791,12 +1795,7 @@ fn quorum_store_reconfig_enable_test() -> ForgeConfig {
             SuccessCriteria::new(5000)
                 .add_no_restarts()
                 .add_wait_for_catchup_s(240)
-                .add_system_metrics_threshold(SystemMetricsThreshold::new(
-                    // Check that we don't use more than 12 CPU cores for 30% of the time.
-                    MetricsThreshold::new(12, 30),
-                    // Check that we don't use more than 10 GB of memory for 30% of the time.
-                    MetricsThreshold::new(10 * 1024 * 1024 * 1024, 30),
-                ))
+                .add_system_metrics_threshold(SYSTEM_12_CORES_10GB_THRESHOLD.clone())
                 .add_chain_progress(StateProgressThreshold {
                     max_no_progress_secs: 10.0,
                     max_round_gap: 4,
@@ -1863,12 +1862,7 @@ fn multiregion_benchmark_test() -> ForgeConfig {
                     // Give at least 60s for catchup, give 10% of the run for longer durations.
                     180,
                 )
-                .add_system_metrics_threshold(SystemMetricsThreshold::new(
-                    // Check that we don't use more than 12 CPU cores for 30% of the time.
-                    MetricsThreshold::new(12, 30),
-                    // Check that we don't use more than 10 GB of memory for 30% of the time.
-                    MetricsThreshold::new(10 * 1024 * 1024 * 1024, 30),
-                ))
+                .add_system_metrics_threshold(SYSTEM_12_CORES_10GB_THRESHOLD.clone())
                 .add_chain_progress(StateProgressThreshold {
                     max_no_progress_secs: 10.0,
                     max_round_gap: 4,

diff --git a/testsuite/forge/src/backend/k8s/prometheus.rs b/testsuite/forge/src/backend/k8s/prometheus.rs
@@ -2,12 +2,23 @@
 // SPDX-License-Identifier: Apache-2.0
 
 use crate::{create_k8s_client, K8sApi, ReadWrite, Result};
-use anyhow::bail;
+use again::RetryPolicy;
+use anyhow::{anyhow, bail};
 use aptos_logger::info;
 use k8s_openapi::api::core::v1::Secret;
-use prometheus_http_query::{response::PromqlResult, Client as PrometheusClient};
+use once_cell::sync::Lazy;
+use prometheus_http_query::{
+    response::{PromqlResult, Sample},
+    Client as PrometheusClient,
+};
 use reqwest::{header, Client as HttpClient};
-use std::{collections::BTreeMap, sync::Arc};
+use std::{collections::BTreeMap, sync::Arc, time::Duration};
+
+static PROMETHEUS_RETRY_POLICY: Lazy<RetryPolicy> = Lazy::new(|| {
+    RetryPolicy::exponential(Duration::from_millis(125))
+        .with_max_retries(3)
+        .with_jitter(true)
+});
 
 pub async fn get_prometheus_client() -> Result<PrometheusClient> {
     // read from the environment
@@ -91,7 +102,7 @@ async fn create_prometheus_client_from_environment(
 
 pub fn construct_query_with_extra_labels(
     query: &str,
-    labels_map: BTreeMap<String, String>,
+    labels_map: &BTreeMap<String, String>,
 ) -> String {
     // edit the query string to insert swarm metadata
     let mut new_query = query.to_string();
@@ -123,13 +134,39 @@ pub async fn query_with_metadata(
     query: &str,
     time: Option<i64>,
     timeout: Option<i64>,
-    labels_map: BTreeMap<String, String>,
+    labels_map: &BTreeMap<String, String>,
 ) -> Result<PromqlResult> {
     let new_query = construct_query_with_extra_labels(query, labels_map);
-    match prom_client.query(&new_query, time, timeout).await {
-        Ok(r) => Ok(r),
-        Err(e) => bail!(e),
-    }
+    let new_query_ref = &new_query;
+    PROMETHEUS_RETRY_POLICY
+        .retry(move || prom_client.query(new_query_ref, time, timeout))
+        .await
+        .map_err(|e| anyhow!("Failed to query prometheus for {}: {}", query, e))
+}
+
+pub async fn query_range_with_metadata(
+    prom_client: &PrometheusClient,
+    query: &str,
+    start_time: i64,
+    end_time: i64,
+    internal_secs: f64,
+    timeout: Option<i64>,
+    labels_map: &BTreeMap<String, String>,
+) -> Result<Vec<Sample>> {
+    let new_query = construct_query_with_extra_labels(query, labels_map);
+    let new_query_ref = &new_query;
+    let r = PROMETHEUS_RETRY_POLICY
+        .retry(move || {
+            prom_client.query_range(new_query_ref, start_time, end_time, internal_secs, timeout)
+        })
+        .await
+        .map_err(|e| anyhow!("Failed to query prometheus for {}: {}", query, e))?;
+    Ok(r.as_range()
+        .ok_or_else(|| anyhow!("Failed to get range from prometheus response"))?
+        .first()
+        .ok_or_else(|| anyhow!("Empty range vector returned from prometheus"))?
+        .samples()
+        .to_vec())
 }
 
 #[cfg(test)]
@@ -259,7 +296,7 @@ mod tests {
         labels_map.insert("a".to_string(), "a".to_string());
         labels_map.insert("some_label".to_string(), "blabla".to_string());
         let expected_query = r#"aptos_connections{a="a",some_label="blabla"}"#;
-        let new_query = construct_query_with_extra_labels(original_query, labels_map);
+        let new_query = construct_query_with_extra_labels(original_query, &labels_map);
         assert_eq!(expected_query, new_query);
 
         // test when existing labels
@@ -268,7 +305,7 @@ mod tests {
         labels_map.insert("a".to_string(), "a".to_string());
         labels_map.insert("some_label".to_string(), "blabla".to_string());
         let expected_query = r#"aptos_connections{a="a",some_label="blabla",abc="123",def="456"}"#;
-        let new_query = construct_query_with_extra_labels(original_query, labels_map);
+        let new_query = construct_query_with_extra_labels(original_query, &labels_map);
         assert_eq!(expected_query, new_query);
     }
 }
diff --git a/testsuite/forge/src/backend/k8s/swarm.rs b/testsuite/forge/src/backend/k8s/swarm.rs
@@ -7,7 +7,7 @@ use crate::{
     get_free_port, get_stateful_set_image, install_public_fullnode,
     interface::system_metrics::{query_prometheus_system_metrics, SystemMetricsThreshold},
     node::K8sNode,
-    prometheus::{self, query_with_metadata},
+    prometheus::{self, query_range_with_metadata, query_with_metadata},
     query_sequence_number, set_stateful_set_image_tag, uninstall_testnet_resources, ChainInfo,
     FullNode, K8sApi, Node, Result, Swarm, SwarmChaos, Validator, Version, HAPROXY_SERVICE_SUFFIX,
     REST_API_HAPROXY_SERVICE_PORT, REST_API_SERVICE_PORT,
@@ -29,7 +29,10 @@ use kube::{
     api::{Api, ListParams},
     client::Client as K8sClient,
 };
-use prometheus_http_query::{response::PromqlResult, Client as PrometheusClient};
+use prometheus_http_query::{
+    response::{PromqlResult, Sample},
+    Client as PrometheusClient,
+};
 use regex::Regex;
 use std::{
     collections::{BTreeMap, HashMap, HashSet},
@@ -402,7 +405,31 @@ impl Swarm for K8sSwarm {
         if let Some(c) = &self.prom_client {
             let mut labels_map = BTreeMap::new();
             labels_map.insert("namespace".to_string(), self.kube_namespace.clone());
-            return query_with_metadata(c, query, time, timeout, labels_map).await;
+            return query_with_metadata(c, query, time, timeout, &labels_map).await;
+        }
+        bail!("No prom client");
+    }
+
+    async fn query_range_metrics(
+        &self,
+        query: &str,
+        start_time: i64,
+        end_time: i64,
+        timeout: Option<i64>,
+    ) -> Result<Vec<Sample>> {
+        if let Some(c) = &self.prom_client {
+            let mut labels_map = BTreeMap::new();
+            labels_map.insert("namespace".to_string(), self.kube_namespace.clone());
+            return query_range_with_metadata(
+                c,
+                query,
+                start_time,
+                end_time,
+                30.0,
+                timeout,
+                &labels_map,
+            )
+            .await;
         }
         bail!("No prom client");
     }

diff --git a/testsuite/forge/src/backend/local/swarm.rs b/testsuite/forge/src/backend/local/swarm.rs
@@ -24,7 +24,7 @@ use aptos_sdk::{
         PeerId,
     },
 };
-use prometheus_http_query::response::PromqlResult;
+use prometheus_http_query::response::{PromqlResult, Sample};
 use std::{
     collections::HashMap,
     fs,
@@ -628,6 +628,16 @@ impl Swarm for LocalSwarm {
         todo!()
     }
 
+    async fn query_range_metrics(
+        &self,
+        _query: &str,
+        _start_time: i64,
+        _end_time: i64,
+        _timeout: Option<i64>,
+    ) -> Result<Vec<Sample>> {
+        todo!()
+    }
+
     async fn ensure_healthy_system_metrics(
         &mut self,
         _start_time: i64,

diff --git a/testsuite/forge/src/interface/network.rs b/testsuite/forge/src/interface/network.rs
@@ -4,7 +4,7 @@
 
 use super::Test;
 use crate::{
-    success_criteria::{SuccessCriteria, SuccessCriteriaChecker},
+    success_criteria::{LatencyBreakdown, SuccessCriteria, SuccessCriteriaChecker},
     CoreContext, Result, Swarm, TestReport,
 };
 use aptos_transaction_emitter_lib::{EmitJobRequest, TxnStats};
@@ -61,6 +61,7 @@ impl<'t> NetworkContext<'t> {
         &mut self,
         stats: &TxnStats,
         window: Duration,
+        latency_breakdown: &LatencyBreakdown,
         start_time: i64,
         end_time: i64,
         start_version: u64,
@@ -73,6 +74,7 @@ impl<'t> NetworkContext<'t> {
                 self.report,
                 stats,
                 window,
+                latency_breakdown,
                 start_time,
                 end_time,
                 start_version,