Skip to content

Commit

Permalink
Merge pull request #1049 from NordSecurity/LLT-5884_pq_break_after_nonet
Browse files Browse the repository at this point in the history
LLT-5884: PQ VPN breaks after some time of nonet
  • Loading branch information
mathiaspeters authored Jan 13, 2025
2 parents 34c1eaa + f9e3e38 commit 9d60ac0
Show file tree
Hide file tree
Showing 7 changed files with 157 additions and 41 deletions.
1 change: 1 addition & 0 deletions .unreleased/LLT-5884
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Restart postquantum if the last handshake has passed the wireguard reject timeout
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions crates/telio-pq/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ telio-utils.workspace = true
neptun.workspace = true
pnet_packet.workspace = true
mockall = { workspace = true, optional = true }
parking_lot.workspace = true
rand.workspace = true
thiserror.workspace = true
tokio.workspace = true
Expand Down
92 changes: 59 additions & 33 deletions crates/telio-pq/src/entity.rs
Original file line number Diff line number Diff line change
@@ -1,12 +1,15 @@
use std::net::SocketAddr;
use std::sync::Arc;

use parking_lot::Mutex;

use telio_model::features::FeaturePostQuantumVPN;
use telio_task::io::chan;
use telio_utils::telio_log_debug;

struct Peer {
pubkey: telio_crypto::PublicKey,
wg_secret: telio_crypto::SecretKey,
addr: SocketAddr,
/// This is a key rotation task guard, its `Drop` implementation aborts the task
_rotation_task: super::conn::ConnKeyRotation,
Expand All @@ -17,16 +20,16 @@ pub struct Entity {
features: FeaturePostQuantumVPN,
sockets: Arc<telio_sockets::SocketPool>,
chan: chan::Tx<super::Event>,
peer: Option<Peer>,
peer: Mutex<Option<Peer>>,
}

impl crate::PostQuantum for Entity {
fn keys(&self) -> Option<crate::Keys> {
self.peer.as_ref().and_then(|p| p.keys.clone())
self.peer.lock().as_ref().and_then(|p| p.keys.clone())
}

fn is_rotating_keys(&self) -> bool {
self.peer.is_some()
self.peer.lock().is_some()
}
}

Expand All @@ -40,44 +43,50 @@ impl Entity {
features,
sockets,
chan,
peer: None,
peer: Mutex::new(None),
}
}

pub fn on_event(&mut self, event: super::Event) {
let Some(peer) = &mut self.peer else {
return;
};

match event {
super::Event::Handshake(addr, keys) => {
if peer.addr == addr {
peer.keys = Some(keys);
pub fn on_event(&self, event: super::Event) {
if let Some(peer) = self.peer.lock().as_mut() {
match event {
super::Event::Handshake(addr, keys) => {
if peer.addr == addr {
peer.keys = Some(keys);
}
}
}
super::Event::Rekey(super::Keys {
wg_secret,
pq_shared,
}) => {
if let Some(keys) = &mut peer.keys {
// Check if we are still talking to the same exit node
if keys.wg_secret == wg_secret {
// and only then update the preshared key,
// otherwise we're connecting to different node already
keys.pq_shared = pq_shared;
} else {
telio_log_debug!(
"PQ secret key does not match, ignoring shared secret rotation"
);
super::Event::Rekey(super::Keys {
wg_secret,
pq_shared,
}) => {
if let Some(keys) = &mut peer.keys {
// Check if we are still talking to the same exit node
if keys.wg_secret == wg_secret {
// and only then update the preshared key,
// otherwise we're connecting to different node already
keys.pq_shared = pq_shared;
} else {
telio_log_debug!(
"PQ secret key does not match, ignoring shared secret rotation"
);
}
}
}
_ => (),
}
_ => (),
}
}

pub async fn stop(&mut self) {
if let Some(peer) = self.peer.take() {
pub fn peer_pubkey(&self) -> Option<telio_crypto::PublicKey> {
self.peer.lock().as_ref().map(|p| p.pubkey)
}

// There is some temporary event filtering around disconnected events for postquantum
// That filtering depends heavily on the peer being removed before emitting Disconnected
// here
pub async fn stop(&self) {
let peer = self.peer.lock().take();
if let Some(peer) = peer {
#[allow(mpsc_blocking_send)]
let _ = self
.chan
Expand All @@ -87,15 +96,32 @@ impl Entity {
}

pub async fn start(
&mut self,
&self,
addr: SocketAddr,
wg_secret: telio_crypto::SecretKey,
peer: telio_crypto::PublicKey,
) {
self.stop().await;
self.start_impl(addr, wg_secret, peer).await;
}

self.peer = Some(Peer {
pub async fn restart(&self) {
let peer = self.peer.lock().take();
if let Some(peer) = peer {
self.start_impl(peer.addr, peer.wg_secret, peer.pubkey)
.await;
}
}

async fn start_impl(
&self,
addr: SocketAddr,
wg_secret: telio_crypto::SecretKey,
peer: telio_crypto::PublicKey,
) {
*self.peer.lock() = Some(Peer {
pubkey: peer,
wg_secret: wg_secret.clone(),
addr,
_rotation_task: super::conn::ConnKeyRotation::run(
self.chan.clone(),
Expand Down
51 changes: 45 additions & 6 deletions nat-lab/tests/test_pq.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import asyncio
import config
import pytest
from contextlib import AsyncExitStack
Expand Down Expand Up @@ -89,7 +90,7 @@ async def inspect_preshared_key(nlx_conn: Connection) -> str:
connection_tracker_config=generate_connection_tracker_config(
ConnectionTag.WINDOWS_VM_1,
stun_limits=(1, 1),
nlx_1_limits=(1, 2),
nlx_1_limits=(1, 3),
),
is_meshnet=False,
),
Expand All @@ -105,7 +106,7 @@ async def inspect_preshared_key(nlx_conn: Connection) -> str:
connection_tracker_config=generate_connection_tracker_config(
ConnectionTag.WINDOWS_VM_1,
stun_limits=(1, 1),
nlx_1_limits=(1, 2),
nlx_1_limits=(1, 3),
),
is_meshnet=False,
),
Expand Down Expand Up @@ -136,7 +137,7 @@ async def test_pq_vpn_connection(
) -> None:
async with AsyncExitStack() as exit_stack:
env = await exit_stack.enter_async_context(
setup_environment(exit_stack, [alpha_setup_params])
setup_environment(exit_stack, [alpha_setup_params], prepare_vpn=True)
)

client_conn, *_ = [conn.connection for conn in env.connections]
Expand Down Expand Up @@ -188,7 +189,7 @@ async def test_pq_vpn_connection(
connection_tracker_config=generate_connection_tracker_config(
ConnectionTag.WINDOWS_VM_1,
stun_limits=(1, 1),
nlx_1_limits=(1, 2),
nlx_1_limits=(1, 3),
),
is_meshnet=False,
),
Expand All @@ -204,7 +205,7 @@ async def test_pq_vpn_connection(
connection_tracker_config=generate_connection_tracker_config(
ConnectionTag.WINDOWS_VM_1,
stun_limits=(1, 1),
nlx_1_limits=(1, 2),
nlx_1_limits=(1, 3),
),
is_meshnet=False,
),
Expand Down Expand Up @@ -238,7 +239,7 @@ async def test_pq_vpn_rekey(

async with AsyncExitStack() as exit_stack:
env = await exit_stack.enter_async_context(
setup_environment(exit_stack, [alpha_setup_params])
setup_environment(exit_stack, [alpha_setup_params], prepare_vpn=True)
)

client_conn, *_ = [conn.connection for conn in env.connections]
Expand Down Expand Up @@ -510,3 +511,41 @@ async def test_pq_vpn_upgrade_from_non_pq(

preshared = await read_preshared_key_slot(nlx_conn)
assert preshared != EMPTY_PRESHARED_KEY_SLOT


# Regression test for LLT-5884
@pytest.mark.timeout(240)
async def test_pq_vpn_handshake_after_nonet() -> None:
public_ip = "10.0.254.1"
async with AsyncExitStack() as exit_stack:
env = await exit_stack.enter_async_context(
setup_environment(
exit_stack,
[
SetupParameters(
connection_tag=ConnectionTag.DOCKER_CONE_CLIENT_1,
adapter_type_override=TelioAdapterType.NEP_TUN,
is_meshnet=False,
),
],
prepare_vpn=True,
)
)

client_conn, *_ = [conn.connection for conn in env.connections]
client_alpha, *_ = env.clients

ip = await stun.get(client_conn, config.STUN_SERVER)
assert ip == public_ip, f"wrong public IP before connecting to VPN {ip}"

await _connect_vpn_pq(
client_conn,
client_alpha,
)

async with client_alpha.get_router().break_udp_conn_to_host(
str(config.NLX_SERVER["ipv4"])
):
await asyncio.sleep(195)

await ping(client_conn, config.PHOTO_ALBUM_IP, timeout=10)
14 changes: 12 additions & 2 deletions src/device.rs
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ use telio_model::{
constants::{VPN_EXTERNAL_IPV4, VPN_INTERNAL_IPV4},
event::{Event, Set},
features::{FeaturePersistentKeepalive, Features, PathType},
mesh::{ExitNode, LinkState, Node},
mesh::{ExitNode, LinkState, Node, NodeState},
validation::validate_nickname,
EndpointMap,
};
Expand Down Expand Up @@ -2325,6 +2325,16 @@ impl Runtime {
}
}

fn should_supress_disconnected(&self, node: &Node) -> bool {
self.entities
.postquantum_wg
.peer_pubkey()
.map(|pubkey| {
node.public_key == pubkey && matches!(node.state, NodeState::Disconnected)
})
.unwrap_or(false)
}

fn remember_last_transmitted_node_event(&mut self, node: Node) {
if node.state == PeerState::Disconnected {
self.last_transmitted_event.remove(&node.public_key);
Expand Down Expand Up @@ -2389,7 +2399,7 @@ impl TaskRuntime for Runtime {

if let Some(node) = node {
// Publish WG event to app
if !self.is_dublicated_event(&node) {
if !self.is_dublicated_event(&node) && !self.should_supress_disconnected(&node) {
telio_log_debug!("Event is being published to libtelio integrators {node:?}");
let _ = self.event_publishers.libtelio_event_publisher.send(
Box::new(Event::Node {body: node.clone()})
Expand Down
38 changes: 38 additions & 0 deletions src/device/wg_controller.rs
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,8 @@ pub async fn consolidate_wg_state(
entities: &Entities,
features: &Features,
) -> Result {
maybe_restart_pq(entities).await;

let remote_peer_states = if let Some(meshnet_entities) = entities.meshnet.left() {
meshnet_entities.derp.get_remote_peer_states().await
} else {
Expand Down Expand Up @@ -147,6 +149,42 @@ pub async fn consolidate_wg_state(
Ok(())
}

// Postquantum has a quirk that can cause VPN connections to fail due to the client having a preshared key when the server doesn't
// This can happen if there is a handshake and a preshared key, then the client nonets for more than 180s (wg reject threshold),
// and then tries to handshake again
//
// The purpose of this function is to prevent this by restarting postquantum if the above case is reached. There are two conditions
// that need to be fulfilled for us to restart postquantum:
// 1. postquantum is active (here represented by checking if there is a peer public key)
// 2. the last handshake to the VPN server has passed the hardcoded wireguard reject threshold
// (here represented by comparing against `Some(None)`. The `time_since_last_handshake` will tick up to 180s and then be set to `None` so
// if `None` means that wireguard will reject the connection)
async fn maybe_restart_pq(entities: &Entities) {
let should_restart_pq = match entities.postquantum_wg.peer_pubkey() {
Some(pubkey) => {
let time_since_last_handshake = entities
.wireguard_interface
.get_interface()
.await
.ok()
.and_then(|ifc| {
ifc.peers.iter().find_map(|(_, peer)| {
if peer.public_key == pubkey {
Some(peer.time_since_last_handshake)
} else {
None
}
})
});
matches!(time_since_last_handshake, Some(None))
}
None => false,
};
if should_restart_pq {
entities.postquantum_wg.restart().await;
}
}

async fn consolidate_wg_private_key<W: WireGuard>(
requested_state: &RequestedState,
wireguard_interface: &W,
Expand Down

0 comments on commit 9d60ac0

Please sign in to comment.