diff --git a/client/network/src/service.rs b/client/network/src/service.rs index fb33901dd09c3..48b5c30d3ef44 100644 --- a/client/network/src/service.rs +++ b/client/network/src/service.rs @@ -38,13 +38,13 @@ use crate::{ }; use futures::prelude::*; use libp2p::{PeerId, Multiaddr}; -use libp2p::core::{Executor, connection::PendingConnectionError}; +use libp2p::core::{ConnectedPoint, Executor, connection::{ConnectionError, PendingConnectionError}}; use libp2p::kad::record; -use libp2p::swarm::{NetworkBehaviour, SwarmBuilder, SwarmEvent}; +use libp2p::swarm::{NetworkBehaviour, SwarmBuilder, SwarmEvent, protocols_handler::NodeHandlerWrapperError}; use log::{error, info, trace, warn}; use parking_lot::Mutex; use prometheus_endpoint::{ - register, Counter, Gauge, GaugeVec, HistogramOpts, HistogramVec, Opts, PrometheusError, Registry, U64, + register, Counter, CounterVec, Gauge, GaugeVec, HistogramOpts, HistogramVec, Opts, PrometheusError, Registry, U64, }; use sc_peerset::PeersetHandle; use sp_consensus::import_queue::{BlockImportError, BlockImportResult, ImportQueue, Link}; @@ -821,13 +821,18 @@ pub struct NetworkWorker { struct Metrics { // This list is ordered alphabetically - connections: Gauge, + connections: GaugeVec, + connections_closed_total: CounterVec, import_queue_blocks_submitted: Counter, import_queue_finality_proofs_submitted: Counter, import_queue_justifications_submitted: Counter, + incoming_connections_errors_total: Counter, + incoming_connections_total: Counter, is_major_syncing: Gauge, issued_light_requests: Counter, kbuckets_num_nodes: Gauge, + listeners_local_addresses: Gauge, + listeners_errors_total: Counter, network_per_sec_bytes: GaugeVec, notifications_queues_size: HistogramVec, notifications_sizes: HistogramVec, @@ -836,6 +841,8 @@ struct Metrics { peers_count: Gauge, peerset_num_discovered: Gauge, peerset_num_requested: Gauge, + pending_connections: Gauge, + pending_connections_errors_total: CounterVec, random_kademalia_queries_total: Counter, } @@ -843,8 +850,19 @@ impl Metrics { fn register(registry: &Registry) -> Result { Ok(Self { // This list is ordered alphabetically - connections: register(Gauge::new( - "sub_libp2p_connections", "Number of libp2p connections" + connections: register(GaugeVec::new( + Opts::new( + "sub_libp2p_connections", + "Number of active libp2p connections" + ), + &["direction"] + )?, registry)?, + connections_closed_total: register(CounterVec::new( + Opts::new( + "sub_libp2p_connections_closed_total", + "Total number of connections closed, by reason" + ), + &["reason"] )?, registry)?, import_queue_blocks_submitted: register(Counter::new( "import_queue_blocks_submitted", @@ -858,6 +876,14 @@ impl Metrics { "import_queue_justifications_submitted", "Number of justifications submitted to the import queue.", )?, registry)?, + incoming_connections_errors_total: register(Counter::new( + "sub_libp2p_incoming_connections_handshake_errors_total", + "Total number of incoming connections that have failed during the initial handshake" + )?, registry)?, + incoming_connections_total: register(Counter::new( + "sub_libp2p_incoming_connections_total", + "Total number of incoming connections on the listening sockets" + )?, registry)?, is_major_syncing: register(Gauge::new( "sub_libp2p_is_major_syncing", "Whether the node is performing a major sync or not.", )?, registry)?, @@ -868,6 +894,13 @@ impl Metrics { kbuckets_num_nodes: register(Gauge::new( "sub_libp2p_kbuckets_num_nodes", "Number of nodes in the Kademlia k-buckets" )?, registry)?, + listeners_local_addresses: register(Gauge::new( + "sub_libp2p_listeners_local_addresses", "Number of local addresses we're listening on" + )?, registry)?, + listeners_errors_total: register(Counter::new( + "sub_libp2p_listeners_errors_total", + "Total number of non-fatal errors reported by a listener" + )?, registry)?, network_per_sec_bytes: register(GaugeVec::new( Opts::new( "sub_libp2p_network_per_sec_bytes", @@ -916,6 +949,17 @@ impl Metrics { peerset_num_requested: register(Gauge::new( "sub_libp2p_peerset_num_requested", "Number of nodes that the peerset manager wants us to be connected to", )?, registry)?, + pending_connections: register(Gauge::new( + "sub_libp2p_pending_connections", + "Number of connections in the process of being established", + )?, registry)?, + pending_connections_errors_total: register(CounterVec::new( + Opts::new( + "sub_libp2p_pending_connections_errors_total", + "Total number of node connection failures" + ), + &["reason"] + )?, registry)?, random_kademalia_queries_total: register(Counter::new( "sub_libp2p_random_kademalia_queries_total", "Number of random Kademlia queries started", )?, registry)?, @@ -1047,22 +1091,50 @@ impl Future for NetworkWorker { metrics.update_with_network_event(&ev); } }, - Poll::Ready(SwarmEvent::ConnectionEstablished { peer_id, .. }) => { + Poll::Ready(SwarmEvent::ConnectionEstablished { peer_id, endpoint, .. }) => { trace!(target: "sub-libp2p", "Libp2p => Connected({:?})", peer_id); if let Some(metrics) = this.metrics.as_ref() { - metrics.connections.inc(); + match endpoint { + ConnectedPoint::Dialer { .. } => + metrics.connections.with_label_values(&["out"]).inc(), + ConnectedPoint::Listener { .. } => + metrics.connections.with_label_values(&["in"]).inc(), + } + } + }, + Poll::Ready(SwarmEvent::ConnectionClosed { peer_id, cause, endpoint, .. }) => { + trace!(target: "sub-libp2p", "Libp2p => Disconnected({:?}, {:?})", peer_id, cause); + if let Some(metrics) = this.metrics.as_ref() { + match endpoint { + ConnectedPoint::Dialer { .. } => + metrics.connections.with_label_values(&["out"]).dec(), + ConnectedPoint::Listener { .. } => + metrics.connections.with_label_values(&["in"]).dec(), + } + match cause { + ConnectionError::IO(_) => + metrics.connections_closed_total.with_label_values(&["transport-error"]).inc(), + ConnectionError::ConnectionLimit(_) => + metrics.connections_closed_total.with_label_values(&["limit-reached"]).inc(), + ConnectionError::Handler(NodeHandlerWrapperError::Handler(_)) => + metrics.connections_closed_total.with_label_values(&["protocol-error"]).inc(), + ConnectionError::Handler(NodeHandlerWrapperError::KeepAliveTimeout) => + metrics.connections_closed_total.with_label_values(&["keep-alive-timeout"]).inc(), + } + } + }, + Poll::Ready(SwarmEvent::NewListenAddr(addr)) => { + trace!(target: "sub-libp2p", "Libp2p => NewListenAddr({})", addr); + if let Some(metrics) = this.metrics.as_ref() { + metrics.listeners_local_addresses.inc(); } }, - Poll::Ready(SwarmEvent::ConnectionClosed { peer_id, .. }) => { - trace!(target: "sub-libp2p", "Libp2p => Disconnected({:?})", peer_id); + Poll::Ready(SwarmEvent::ExpiredListenAddr(addr)) => { + trace!(target: "sub-libp2p", "Libp2p => ExpiredListenAddr({})", addr); if let Some(metrics) = this.metrics.as_ref() { - metrics.connections.dec(); + metrics.listeners_local_addresses.dec(); } }, - Poll::Ready(SwarmEvent::NewListenAddr(addr)) => - trace!(target: "sub-libp2p", "Libp2p => NewListenAddr({})", addr), - Poll::Ready(SwarmEvent::ExpiredListenAddr(addr)) => - trace!(target: "sub-libp2p", "Libp2p => ExpiredListenAddr({})", addr), Poll::Ready(SwarmEvent::UnreachableAddr { peer_id, address, error, .. }) => { trace!( target: "sub-libp2p", "Libp2p => Failed to reach {:?} through {:?}: {}", @@ -1080,25 +1152,54 @@ impl Future for NetworkWorker { ); } } + + if let Some(metrics) = this.metrics.as_ref() { + match error { + PendingConnectionError::InvalidPeerId => + metrics.pending_connections_errors_total.with_label_values(&["invalid-peer-id"]).inc(), + PendingConnectionError::Transport(_) | PendingConnectionError::IO(_) => + metrics.pending_connections_errors_total.with_label_values(&["transport-error"]).inc(), + } + } } Poll::Ready(SwarmEvent::Dialing(peer_id)) => trace!(target: "sub-libp2p", "Libp2p => Dialing({:?})", peer_id), - Poll::Ready(SwarmEvent::IncomingConnection { local_addr, send_back_addr }) => + Poll::Ready(SwarmEvent::IncomingConnection { local_addr, send_back_addr }) => { trace!(target: "sub-libp2p", "Libp2p => IncomingConnection({},{}))", - local_addr, send_back_addr), - Poll::Ready(SwarmEvent::IncomingConnectionError { local_addr, send_back_addr, error }) => + local_addr, send_back_addr); + if let Some(metrics) = this.metrics.as_ref() { + metrics.incoming_connections_total.inc(); + } + }, + Poll::Ready(SwarmEvent::IncomingConnectionError { local_addr, send_back_addr, error }) => { trace!(target: "sub-libp2p", "Libp2p => IncomingConnectionError({},{}): {}", - local_addr, send_back_addr, error), - Poll::Ready(SwarmEvent::BannedPeer { peer_id, endpoint }) => + local_addr, send_back_addr, error); + if let Some(metrics) = this.metrics.as_ref() { + metrics.incoming_connections_errors_total.inc(); + } + }, + Poll::Ready(SwarmEvent::BannedPeer { peer_id, endpoint }) => { trace!(target: "sub-libp2p", "Libp2p => BannedPeer({}). Connected via {:?}.", - peer_id, endpoint), + peer_id, endpoint); + if let Some(metrics) = this.metrics.as_ref() { + metrics.incoming_connections_errors_total.inc(); + } + }, Poll::Ready(SwarmEvent::UnknownPeerUnreachableAddr { address, error }) => trace!(target: "sub-libp2p", "Libp2p => UnknownPeerUnreachableAddr({}): {}", address, error), - Poll::Ready(SwarmEvent::ListenerClosed { reason, addresses: _ }) => - warn!(target: "sub-libp2p", "Libp2p => ListenerClosed: {:?}", reason), - Poll::Ready(SwarmEvent::ListenerError { error }) => - trace!(target: "sub-libp2p", "Libp2p => ListenerError: {}", error), + Poll::Ready(SwarmEvent::ListenerClosed { reason, addresses }) => { + warn!(target: "sub-libp2p", "Libp2p => ListenerClosed: {:?}", reason); + if let Some(metrics) = this.metrics.as_ref() { + metrics.listeners_local_addresses.sub(addresses.len() as u64); + } + }, + Poll::Ready(SwarmEvent::ListenerError { error }) => { + trace!(target: "sub-libp2p", "Libp2p => ListenerError: {}", error); + if let Some(metrics) = this.metrics.as_ref() { + metrics.listeners_errors_total.inc(); + } + }, }; } @@ -1127,6 +1228,7 @@ impl Future for NetworkWorker { metrics.peers_count.set(num_connected_peers as u64); metrics.peerset_num_discovered.set(this.network_service.user_protocol().num_discovered_peers() as u64); metrics.peerset_num_requested.set(this.network_service.user_protocol().requested_peers().count() as u64); + metrics.pending_connections.set(Swarm::network_info(&this.network_service).num_connections_pending as u64); } Poll::Pending