Skip to content

Commit

Permalink
fix(net): Limit the number of leftover nonces in the self-connection …
Browse files Browse the repository at this point in the history
…nonce set (#6534)

* Use a stricter connection rate limit for successful inbound peer connections

* Limit the number of nonces in the self-connection nonce set

* Rate-limit failed inbound connections as well

* Justify the sleep and the yield_now

* Use the configured connection limit rather than a constant

* Tests that the number of nonces is limited (#37)

* Tests that the number of nonces is limited

* removes unused constant

* test that it reaches the nonce limit

---------

Co-authored-by: Arya <[email protected]>
  • Loading branch information
teor2345 and arya2 authored Apr 18, 2023
1 parent 3343c84 commit 0d50d97
Show file tree
Hide file tree
Showing 7 changed files with 244 additions and 44 deletions.
63 changes: 48 additions & 15 deletions zebra-network/src/constants.rs
Original file line number Diff line number Diff line change
Expand Up @@ -149,14 +149,48 @@ pub const MAX_RECENT_PEER_AGE: Duration32 = Duration32::from_days(3);
/// Using a prime number makes sure that heartbeats don't synchronise with crawls.
pub const HEARTBEAT_INTERVAL: Duration = Duration::from_secs(59);

/// The minimum time between successive calls to
/// The minimum time between outbound peer connections, implemented by
/// [`CandidateSet::next`][crate::peer_set::CandidateSet::next].
///
/// ## Security
///
/// Zebra resists distributed denial of service attacks by making sure that new peer connections
/// are initiated at least [`MIN_PEER_CONNECTION_INTERVAL`] apart.
pub const MIN_PEER_CONNECTION_INTERVAL: Duration = Duration::from_millis(25);
/// Zebra resists distributed denial of service attacks by making sure that new outbound peer
/// connections are only initiated after this minimum time has elapsed.
///
/// It also enforces a minimum per-peer reconnection interval, and filters failed outbound peers.
pub const MIN_OUTBOUND_PEER_CONNECTION_INTERVAL: Duration = Duration::from_millis(50);

/// The minimum time between _successful_ inbound peer connections, implemented by
/// `peer_set::initialize::accept_inbound_connections`.
///
/// To support multiple peers connecting simultaneously, this is less than the
/// [`HANDSHAKE_TIMEOUT`].
///
/// ## Security
///
/// Zebra resists distributed denial of service attacks by limiting the inbound connection rate.
/// After a _successful_ inbound connection, new inbound peer connections are only accepted,
/// and our side of the handshake initiated, after this minimum time has elapsed.
///
/// The inbound interval is much longer than the outbound interval, because Zebra does not
/// control the selection or reconnections of inbound peers.
pub const MIN_INBOUND_PEER_CONNECTION_INTERVAL: Duration = Duration::from_secs(1);

/// The minimum time between _failed_ inbound peer connections, implemented by
/// `peer_set::initialize::accept_inbound_connections`.
///
/// This is a tradeoff between:
/// - the memory, CPU, and network usage of each new connection attempt, and
/// - denying service to honest peers due to an attack which makes many inbound connections.
///
/// Attacks that reach this limit should be managed using a firewall or intrusion prevention system.
///
/// ## Security
///
/// Zebra resists distributed denial of service attacks by limiting the inbound connection rate.
/// After a _failed_ inbound connection, new inbound peer connections are only accepted,
/// and our side of the handshake initiated, after this minimum time has elapsed.
pub const MIN_INBOUND_PEER_FAILED_CONNECTION_INTERVAL: Duration = Duration::from_millis(10);

/// The minimum time between successive calls to
/// [`CandidateSet::update`][crate::peer_set::CandidateSet::update].
Expand Down Expand Up @@ -324,9 +358,6 @@ pub mod magics {

#[cfg(test)]
mod tests {

use std::convert::TryFrom;

use zebra_chain::parameters::POST_BLOSSOM_POW_TARGET_SPACING;

use super::*;
Expand Down Expand Up @@ -363,18 +394,20 @@ mod tests {
"The EWMA decay time should be higher than the request timeout, so timed out peers are penalised by the EWMA.");

assert!(
u32::try_from(MAX_ADDRS_IN_ADDRESS_BOOK).expect("fits in u32")
* MIN_PEER_CONNECTION_INTERVAL
< MIN_PEER_RECONNECTION_DELAY,
"each peer should get at least one connection attempt in each connection interval",
MIN_PEER_RECONNECTION_DELAY.as_secs() as f32
/ (u32::try_from(MAX_ADDRS_IN_ADDRESS_BOOK).expect("fits in u32")
* MIN_OUTBOUND_PEER_CONNECTION_INTERVAL)
.as_secs() as f32
>= 0.5,
"most peers should get a connection attempt in each connection interval",
);

assert!(
MIN_PEER_RECONNECTION_DELAY.as_secs()
MIN_PEER_RECONNECTION_DELAY.as_secs() as f32
/ (u32::try_from(MAX_ADDRS_IN_ADDRESS_BOOK).expect("fits in u32")
* MIN_PEER_CONNECTION_INTERVAL)
.as_secs()
<= 2,
* MIN_OUTBOUND_PEER_CONNECTION_INTERVAL)
.as_secs() as f32
<= 2.0,
"each peer should only have a few connection attempts in each connection interval",
);
}
Expand Down
51 changes: 45 additions & 6 deletions zebra-network/src/peer/handshake.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
use std::{
cmp::min,
collections::HashSet,
fmt,
future::Future,
net::{IpAddr, Ipv4Addr, SocketAddr},
Expand All @@ -14,6 +13,7 @@ use std::{

use chrono::{TimeZone, Utc};
use futures::{channel::oneshot, future, pin_mut, FutureExt, SinkExt, StreamExt};
use indexmap::IndexSet;
use tokio::{
io::{AsyncRead, AsyncWrite},
sync::broadcast,
Expand Down Expand Up @@ -48,6 +48,9 @@ use crate::{
BoxError, Config, VersionMessage,
};

#[cfg(test)]
mod tests;

/// A [`Service`] that handshakes with a remote peer and constructs a
/// client/server pair.
///
Expand All @@ -71,7 +74,7 @@ where
address_book_updater: tokio::sync::mpsc::Sender<MetaAddrChange>,
inv_collector: broadcast::Sender<InventoryChange>,
minimum_peer_version: MinimumPeerVersion<C>,
nonces: Arc<futures::lock::Mutex<HashSet<Nonce>>>,
nonces: Arc<futures::lock::Mutex<IndexSet<Nonce>>>,

parent_span: Span,
}
Expand Down Expand Up @@ -515,7 +518,7 @@ where
let (tx, _rx) = tokio::sync::mpsc::channel(1);
tx
});
let nonces = Arc::new(futures::lock::Mutex::new(HashSet::new()));
let nonces = Arc::new(futures::lock::Mutex::new(IndexSet::new()));
let user_agent = self.user_agent.unwrap_or_default();
let our_services = self.our_services.unwrap_or_else(PeerServices::empty);
let relay = self.relay.unwrap_or(false);
Expand Down Expand Up @@ -572,7 +575,7 @@ pub async fn negotiate_version<PeerTransport>(
peer_conn: &mut Framed<PeerTransport, Codec>,
connected_addr: &ConnectedAddr,
config: Config,
nonces: Arc<futures::lock::Mutex<HashSet<Nonce>>>,
nonces: Arc<futures::lock::Mutex<IndexSet<Nonce>>>,
user_agent: String,
our_services: PeerServices,
relay: bool,
Expand All @@ -583,12 +586,43 @@ where
{
// Create a random nonce for this connection
let local_nonce = Nonce::default();

// Insert the nonce for this handshake into the shared nonce set.
// Each connection has its own connection state, and handshakes execute concurrently.
//
// # Correctness
//
// It is ok to wait for the lock here, because handshakes have a short
// timeout, and the async mutex will be released when the task times
// out.
nonces.lock().await.insert(local_nonce);
//
// Duplicate nonces don't matter here, because 64-bit random collisions are very rare.
// If they happen, we're probably replacing a leftover nonce from a failed connection,
// which wasn't cleaned up when it closed.
{
let mut locked_nonces = nonces.lock().await;
locked_nonces.insert(local_nonce);

// # Security
//
// Limit the amount of memory used for nonces.
// Nonces can be left in the set if the connection fails or times out between
// the nonce being inserted, and it being removed.
//
// Zebra has strict connection limits, so we limit the number of nonces to
// the configured connection limit.
// This is a tradeoff between:
// - avoiding memory denial of service attacks which make large numbers of connections,
// for example, 100 failed inbound connections takes 1 second.
// - memory usage: 16 bytes per `Nonce`, 3.2 kB for 200 nonces
// - collision probability: 2^32 has ~50% collision probability, so we use a lower limit
// <https://en.wikipedia.org/wiki/Birthday_problem#Probability_of_a_shared_birthday_(collision)>
while locked_nonces.len() > config.peerset_total_connection_limit() {
locked_nonces.shift_remove_index(0);
}

std::mem::drop(locked_nonces);
};

// Don't leak our exact clock skew to our peers. On the other hand,
// we can't deviate too much, or zcashd will get confused.
Expand Down Expand Up @@ -684,10 +718,15 @@ where
// We must wait for the lock before we continue with the connection, to avoid
// self-connection. If the connection times out, the async lock will be
// released.
//
// # Security
//
// Connections that get a `Version` message will remove their nonces here,
// but connections that fail before this point can leave their nonces in the nonce set.
let nonce_reuse = {
let mut locked_nonces = nonces.lock().await;
let nonce_reuse = locked_nonces.contains(&remote.nonce);
// Regardless of whether we observed nonce reuse, clean up the nonce set.
// Regardless of whether we observed nonce reuse, remove our own nonce from the nonce set.
locked_nonces.remove(&local_nonce);
nonce_reuse
};
Expand Down
15 changes: 15 additions & 0 deletions zebra-network/src/peer/handshake/tests.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
//! Implements methods for testing [`Handshake`]
use super::*;

impl<S, C> Handshake<S, C>
where
S: Service<Request, Response = Response, Error = BoxError> + Clone + Send + 'static,
S::Future: Send,
C: ChainTip + Clone + Send + 'static,
{
/// Returns a count of how many connection nonces are stored in this [`Handshake`]
pub async fn nonce_count(&self) -> usize {
self.nonces.lock().await.len()
}
}
7 changes: 5 additions & 2 deletions zebra-network/src/peer_set/candidate_set.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
//! Candidate peer selection for outbound connections using the [`CandidateSet`].
use std::{cmp::min, sync::Arc};

use chrono::Utc;
Expand Down Expand Up @@ -361,7 +363,8 @@ where
///
/// Zebra resists distributed denial of service attacks by making sure that
/// new peer connections are initiated at least
/// [`MIN_PEER_CONNECTION_INTERVAL`][constants::MIN_PEER_CONNECTION_INTERVAL] apart.
/// [`MIN_OUTBOUND_PEER_CONNECTION_INTERVAL`][constants::MIN_OUTBOUND_PEER_CONNECTION_INTERVAL]
/// apart.
///
/// [`Responded`]: crate::PeerAddrState::Responded
pub async fn next(&mut self) -> Option<MetaAddr> {
Expand Down Expand Up @@ -397,7 +400,7 @@ where

// Security: rate-limit new outbound peer connections
sleep_until(self.min_next_handshake).await;
self.min_next_handshake = Instant::now() + constants::MIN_PEER_CONNECTION_INTERVAL;
self.min_next_handshake = Instant::now() + constants::MIN_OUTBOUND_PEER_CONNECTION_INTERVAL;

Some(next_peer)
}
Expand Down
15 changes: 9 additions & 6 deletions zebra-network/src/peer_set/candidate_set/tests/prop.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
//! Randomised property tests for candidate peer selection.
use std::{
env,
net::SocketAddr,
Expand All @@ -13,7 +15,7 @@ use tracing::Span;
use zebra_chain::{parameters::Network::*, serialization::DateTime32};

use crate::{
constants::MIN_PEER_CONNECTION_INTERVAL,
constants::MIN_OUTBOUND_PEER_CONNECTION_INTERVAL,
meta_addr::{MetaAddr, MetaAddrChange},
AddressBook, BoxError, Request, Response,
};
Expand Down Expand Up @@ -75,7 +77,7 @@ proptest! {
//
// Check that it takes less than the peer set candidate delay,
// and hope that is enough time for test machines with high CPU load.
let less_than_min_interval = MIN_PEER_CONNECTION_INTERVAL - Duration::from_millis(1);
let less_than_min_interval = MIN_OUTBOUND_PEER_CONNECTION_INTERVAL - Duration::from_millis(1);
assert_eq!(runtime.block_on(timeout(less_than_min_interval, candidate_set.next())), Ok(None));
}
}
Expand Down Expand Up @@ -112,7 +114,7 @@ proptest! {
// Check rate limiting for initial peers
check_candidates_rate_limiting(&mut candidate_set, initial_candidates).await;
// Sleep more than the rate limiting delay
sleep(MAX_TEST_CANDIDATES * MIN_PEER_CONNECTION_INTERVAL).await;
sleep(MAX_TEST_CANDIDATES * MIN_OUTBOUND_PEER_CONNECTION_INTERVAL).await;
// Check that the next peers are still respecting the rate limiting, without causing a
// burst of reconnections
check_candidates_rate_limiting(&mut candidate_set, extra_candidates).await;
Expand All @@ -121,7 +123,7 @@ proptest! {
// Allow enough time for the maximum number of candidates,
// plus some extra time for test machines with high CPU load.
// (The max delay asserts usually fail before hitting this timeout.)
let max_rate_limit_sleep = 3 * MAX_TEST_CANDIDATES * MIN_PEER_CONNECTION_INTERVAL;
let max_rate_limit_sleep = 3 * MAX_TEST_CANDIDATES * MIN_OUTBOUND_PEER_CONNECTION_INTERVAL;
let max_extra_delay = (2 * MAX_TEST_CANDIDATES + 1) * MAX_SLEEP_EXTRA_DELAY;
assert!(runtime.block_on(timeout(max_rate_limit_sleep + max_extra_delay, checks)).is_ok());
}
Expand Down Expand Up @@ -161,7 +163,8 @@ where
"rate-limited candidates should not be delayed too long: now: {now:?} max: {maximum_reconnect_instant:?}. Hint: is the test machine overloaded?",
);

minimum_reconnect_instant = now + MIN_PEER_CONNECTION_INTERVAL;
maximum_reconnect_instant = now + MIN_PEER_CONNECTION_INTERVAL + MAX_SLEEP_EXTRA_DELAY;
minimum_reconnect_instant = now + MIN_OUTBOUND_PEER_CONNECTION_INTERVAL;
maximum_reconnect_instant =
now + MIN_OUTBOUND_PEER_CONNECTION_INTERVAL + MAX_SLEEP_EXTRA_DELAY;
}
}
Loading

0 comments on commit 0d50d97

Please sign in to comment.