From 67c3e17805e2e5beb5d535b09490eb277fd7ea25 Mon Sep 17 00:00:00 2001 From: Jeb Bearer Date: Fri, 6 Dec 2024 17:07:42 -0800 Subject: [PATCH] Add metrics exposing a node's idea of its peers' catchup reliability --- Cargo.lock | 2 +- builder/src/non_permissioned.rs | 2 ++ justfile | 6 +++--- marketplace-builder/src/builder.rs | 2 ++ sequencer-sqlite/Cargo.lock | 14 +++++++++++- sequencer/src/api.rs | 9 ++++++++ sequencer/src/catchup.rs | 34 ++++++++++++++++++++++++++---- sequencer/src/lib.rs | 10 ++++++--- 8 files changed, 67 insertions(+), 12 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 768d02a1e..447479d56 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -7421,7 +7421,7 @@ checksum = "714c75db297bc88a63783ffc6ab9f830698a6705aa0201416931759ef4c8183d" dependencies = [ "autocfg", "equivalent", - "indexmap 2.7.0", + "indexmap 2.6.0", ] [[package]] diff --git a/builder/src/non_permissioned.rs b/builder/src/non_permissioned.rs index ab33c1fb3..e07509d6d 100644 --- a/builder/src/non_permissioned.rs +++ b/builder/src/non_permissioned.rs @@ -19,6 +19,7 @@ use hotshot_types::{ data::{fake_commitment, ViewNumber}, traits::{ block_contents::{vid_commitment, GENESIS_VID_NUM_STORAGE_NODES}, + metrics::NoMetrics, node_implementation::Versions, EncodeBytes, }, @@ -53,6 +54,7 @@ pub async fn build_instance_state( Arc::new(StatePeers::::from_urls( state_peers, Default::default(), + &NoMetrics, )), V::Base::VERSION, ); diff --git a/justfile b/justfile index 5d7521a92..e1ea9300b 100644 --- a/justfile +++ b/justfile @@ -10,13 +10,13 @@ demo *args: demo-native *args: build scripts/demo-native {{args}} -build: +build profile="test": #!/usr/bin/env bash set -euxo pipefail # Use the same target dir for both `build` invocations export CARGO_TARGET_DIR=${CARGO_TARGET_DIR:-target} - cargo build --profile test - cargo build --profile test --manifest-path ./sequencer-sqlite/Cargo.toml + cargo build --profile {{profile}} + cargo build --profile {{profile}} --manifest-path ./sequencer-sqlite/Cargo.toml demo-native-mp *args: build scripts/demo-native -f process-compose.yaml -f process-compose-mp.yml {{args}} diff --git a/marketplace-builder/src/builder.rs b/marketplace-builder/src/builder.rs index 46a2005f5..f0daf5e31 100644 --- a/marketplace-builder/src/builder.rs +++ b/marketplace-builder/src/builder.rs @@ -30,6 +30,7 @@ use hotshot_types::{ data::{fake_commitment, Leaf, ViewNumber}, traits::{ block_contents::{vid_commitment, Transaction as _, GENESIS_VID_NUM_STORAGE_NODES}, + metrics::NoMetrics, node_implementation::{ConsensusTime, NodeType, Versions}, EncodeBytes, }, @@ -74,6 +75,7 @@ pub async fn build_instance_state( Arc::new(StatePeers::::from_urls( state_peers, Default::default(), + &NoMetrics, )), V::Base::version(), ); diff --git a/sequencer-sqlite/Cargo.lock b/sequencer-sqlite/Cargo.lock index ad03aad19..668ac6637 100644 --- a/sequencer-sqlite/Cargo.lock +++ b/sequencer-sqlite/Cargo.lock @@ -4083,7 +4083,7 @@ dependencies = [ [[package]] name = "hotshot-query-service" version = "0.1.75" -source = "git+https://github.com/EspressoSystems/hotshot-query-service?branch=hotshot/0.5.82#5e2c984d19da3826f4cc8d80c5cf1a84dcd377f7" +source = "git+https://github.com/EspressoSystems/hotshot-query-service?tag=v0.1.75#dffefa160f441a663723a67bc54efedb11a88b02" dependencies = [ "anyhow", "ark-serialize", @@ -7139,6 +7139,17 @@ dependencies = [ "uint", ] +[[package]] +name = "priority-queue" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "714c75db297bc88a63783ffc6ab9f830698a6705aa0201416931759ef4c8183d" +dependencies = [ + "autocfg", + "equivalent", + "indexmap 2.7.0", +] + [[package]] name = "proc-macro-crate" version = "3.2.0" @@ -8359,6 +8370,7 @@ dependencies = [ "num_enum", "parking_lot", "portpicker", + "priority-queue", "rand 0.8.5", "rand_chacha 0.3.1", "rand_distr", diff --git a/sequencer/src/api.rs b/sequencer/src/api.rs index 9b6a38fa7..a3cd5c593 100644 --- a/sequencer/src/api.rs +++ b/sequencer/src/api.rs @@ -1527,6 +1527,7 @@ mod test { StatePeers::>::from_urls( vec![format!("http://localhost:{port}").parse().unwrap()], Default::default(), + &NoMetrics, ) })) .build(); @@ -1571,6 +1572,7 @@ mod test { StatePeers::>::from_urls( vec![format!("http://localhost:{port}").parse().unwrap()], Default::default(), + &NoMetrics, ), &NoMetrics, test_helpers::STAKE_TABLE_CAPACITY_FOR_TEST, @@ -1636,6 +1638,7 @@ mod test { StatePeers::>::from_urls( vec![format!("http://localhost:{port}").parse().unwrap()], Default::default(), + &NoMetrics, ) })) .network_config(TestConfigBuilder::default().l1_url(l1).build()) @@ -1713,6 +1716,7 @@ mod test { StatePeers::>::from_urls( vec![format!("http://localhost:{port}").parse().unwrap()], Default::default(), + &NoMetrics, ) })) .network_config(TestConfigBuilder::default().l1_url(l1).build()) @@ -1773,6 +1777,7 @@ mod test { StatePeers::::from_urls( vec![format!("http://localhost:{port}").parse().unwrap()], BackoffParams::default(), + &NoMetrics, ) }); @@ -1780,6 +1785,7 @@ mod test { peers[2] = StatePeers::::from_urls( vec![url.clone()], BackoffParams::default(), + &NoMetrics, ); let config = TestNetworkConfigBuilder::::with_num_nodes() @@ -1966,6 +1972,7 @@ mod test { StatePeers::::from_urls( vec![format!("http://localhost:{port}").parse().unwrap()], Default::default(), + &NoMetrics, ) })) .network_config( @@ -2139,6 +2146,7 @@ mod test { StatePeers::>::from_urls( vec![format!("http://localhost:{port}").parse().unwrap()], Default::default(), + &NoMetrics, ) })) .network_config(TestConfigBuilder::default().l1_url(l1).build()) @@ -2203,6 +2211,7 @@ mod test { let peers = StatePeers::>::from_urls( vec!["https://notarealnode.network".parse().unwrap(), url], Default::default(), + &NoMetrics, ); // Fetch the config from node 1, a different node than the one running the service. diff --git a/sequencer/src/catchup.rs b/sequencer/src/catchup.rs index b05ffdaaf..8c8ca7a66 100644 --- a/sequencer/src/catchup.rs +++ b/sequencer/src/catchup.rs @@ -12,7 +12,12 @@ use espresso_types::{ }; use futures::future::{Future, FutureExt, TryFuture, TryFutureExt}; use hotshot_types::{ - data::ViewNumber, network::NetworkConfig, traits::node_implementation::ConsensusTime as _, + data::ViewNumber, + network::NetworkConfig, + traits::{ + metrics::{Counter, CounterFamily, Metrics}, + node_implementation::ConsensusTime as _, + }, ValidatorConfig, }; use itertools::Itertools; @@ -37,12 +42,20 @@ use crate::{ struct Client { inner: surf_disco::Client, url: Url, + requests: Arc>, + failures: Arc>, } impl Client { - pub fn new(url: Url) -> Self { + pub fn new( + url: Url, + requests: &(impl CounterFamily + ?Sized), + failures: &(impl CounterFamily + ?Sized), + ) -> Self { Self { inner: surf_disco::Client::new(url.clone()), + requests: Arc::new(requests.create(vec![url.to_string()])), + failures: Arc::new(failures.create(vec![url.to_string()])), url, } } @@ -169,8 +182,10 @@ impl StatePeers { for (id, success) in requests { scores.change_priority_by(&id, |score| { score.requests += 1; + self.clients[id].requests.add(1); if !success { score.failures += 1; + self.clients[id].failures.add(1); } }); } @@ -178,17 +193,28 @@ impl StatePeers { res } - pub fn from_urls(urls: Vec, backoff: BackoffParams) -> Self { + pub fn from_urls( + urls: Vec, + backoff: BackoffParams, + metrics: &(impl Metrics + ?Sized), + ) -> Self { if urls.is_empty() { panic!("Cannot create StatePeers with no peers"); } + let metrics = metrics.subgroup("catchup".into()); + let requests = metrics.counter_family("requests".into(), vec!["peer".into()]); + let failures = metrics.counter_family("request_failures".into(), vec!["peer".into()]); + let scores = urls .iter() .enumerate() .map(|(i, _)| (i, PeerScore::default())) .collect(); - let clients = urls.into_iter().map(Client::new).collect(); + let clients = urls + .into_iter() + .map(|url| Client::new(url, &*requests, &*failures)) + .collect(); Self { clients, diff --git a/sequencer/src/lib.rs b/sequencer/src/lib.rs index bba6356ad..e03aef910 100644 --- a/sequencer/src/lib.rs +++ b/sequencer/src/lib.rs @@ -53,7 +53,7 @@ use hotshot_types::{ light_client::{StateKeyPair, StateSignKey}, signature_key::{BLSPrivKey, BLSPubKey}, traits::{ - metrics::Metrics, + metrics::{Metrics, NoMetrics}, network::ConnectedNetwork, node_implementation::{NodeImplementation, NodeType, Versions}, }, @@ -312,8 +312,11 @@ pub async fn init_node( // If we were told to fetch the config from an already-started peer, do so. (None, Some(peers)) => { tracing::info!(?peers, "loading network config from peers"); - let peers = - StatePeers::::from_urls(peers, network_params.catchup_backoff); + let peers = StatePeers::::from_urls( + peers, + network_params.catchup_backoff, + &NoMetrics, + ); let config = peers.fetch_config(validator_config.clone()).await?; tracing::info!( @@ -509,6 +512,7 @@ pub async fn init_node( StatePeers::::from_urls( network_params.state_peers, network_params.catchup_backoff, + metrics, ), ) .await,