Skip to content

Commit

Permalink
Restart PQ after nonet
Browse files Browse the repository at this point in the history
There was a bug where after some time of nonet the handshake to the VPN
server would expire and then when the device came back online it would
not be able to re-establish a new handshake because it was trying to
include a preshared key the server no longer knew about. To fix it,
during wg state consolidation, if postquantum is active and the latest
handshake was more than 180 seconds ago (how long after a handshake
wireguard is hardcoded to reject a connection if no new handshake is
established), restart postquantum.
  • Loading branch information
Mathias Peters committed Jan 13, 2025
1 parent 34c1eaa commit d0e91bb
Show file tree
Hide file tree
Showing 7 changed files with 151 additions and 35 deletions.
1 change: 1 addition & 0 deletions .unreleased/LLT-5884
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Restart postquantum if the last handshake has passed the wireguard reject timeout
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions crates/telio-pq/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ telio-utils.workspace = true
neptun.workspace = true
pnet_packet.workspace = true
mockall = { workspace = true, optional = true }
parking_lot.workspace = true
rand.workspace = true
thiserror.workspace = true
tokio.workspace = true
Expand Down
92 changes: 59 additions & 33 deletions crates/telio-pq/src/entity.rs
Original file line number Diff line number Diff line change
@@ -1,12 +1,15 @@
use std::net::SocketAddr;
use std::sync::Arc;

use parking_lot::Mutex;

use telio_model::features::FeaturePostQuantumVPN;
use telio_task::io::chan;
use telio_utils::telio_log_debug;

struct Peer {
pubkey: telio_crypto::PublicKey,
wg_secret: telio_crypto::SecretKey,
addr: SocketAddr,
/// This is a key rotation task guard, its `Drop` implementation aborts the task
_rotation_task: super::conn::ConnKeyRotation,
Expand All @@ -17,16 +20,16 @@ pub struct Entity {
features: FeaturePostQuantumVPN,
sockets: Arc<telio_sockets::SocketPool>,
chan: chan::Tx<super::Event>,
peer: Option<Peer>,
peer: Mutex<Option<Peer>>,
}

impl crate::PostQuantum for Entity {
fn keys(&self) -> Option<crate::Keys> {
self.peer.as_ref().and_then(|p| p.keys.clone())
self.peer.lock().as_ref().and_then(|p| p.keys.clone())
}

fn is_rotating_keys(&self) -> bool {
self.peer.is_some()
self.peer.lock().is_some()
}
}

Expand All @@ -40,44 +43,50 @@ impl Entity {
features,
sockets,
chan,
peer: None,
peer: Mutex::new(None),
}
}

pub fn on_event(&mut self, event: super::Event) {
let Some(peer) = &mut self.peer else {
return;
};

match event {
super::Event::Handshake(addr, keys) => {
if peer.addr == addr {
peer.keys = Some(keys);
pub fn on_event(&self, event: super::Event) {
if let Some(peer) = self.peer.lock().as_mut() {
match event {
super::Event::Handshake(addr, keys) => {
if peer.addr == addr {
peer.keys = Some(keys);
}
}
}
super::Event::Rekey(super::Keys {
wg_secret,
pq_shared,
}) => {
if let Some(keys) = &mut peer.keys {
// Check if we are still talking to the same exit node
if keys.wg_secret == wg_secret {
// and only then update the preshared key,
// otherwise we're connecting to different node already
keys.pq_shared = pq_shared;
} else {
telio_log_debug!(
"PQ secret key does not match, ignoring shared secret rotation"
);
super::Event::Rekey(super::Keys {
wg_secret,
pq_shared,
}) => {
if let Some(keys) = &mut peer.keys {
// Check if we are still talking to the same exit node
if keys.wg_secret == wg_secret {
// and only then update the preshared key,
// otherwise we're connecting to different node already
keys.pq_shared = pq_shared;
} else {
telio_log_debug!(
"PQ secret key does not match, ignoring shared secret rotation"
);
}
}
}
_ => (),
}
_ => (),
}
}

pub async fn stop(&mut self) {
if let Some(peer) = self.peer.take() {
pub fn peer_pubkey(&self) -> Option<telio_crypto::PublicKey> {
self.peer.lock().as_ref().map(|p| p.pubkey)
}

// There is some temporary event filtering around disconnected events for postquantum
// That filtering depends heavily on the peer being removed before emitting Disconnected
// here
pub async fn stop(&self) {
let peer = self.peer.lock().take();
if let Some(peer) = peer {
#[allow(mpsc_blocking_send)]
let _ = self
.chan
Expand All @@ -87,15 +96,32 @@ impl Entity {
}

pub async fn start(
&mut self,
&self,
addr: SocketAddr,
wg_secret: telio_crypto::SecretKey,
peer: telio_crypto::PublicKey,
) {
self.stop().await;
self.start_impl(addr, wg_secret, peer).await;
}

self.peer = Some(Peer {
pub async fn restart(&self) {
let peer = self.peer.lock().take();
if let Some(peer) = peer {
self.start_impl(peer.addr, peer.wg_secret, peer.pubkey)
.await;
}
}

async fn start_impl(
&self,
addr: SocketAddr,
wg_secret: telio_crypto::SecretKey,
peer: telio_crypto::PublicKey,
) {
*self.peer.lock() = Some(Peer {
pubkey: peer,
wg_secret: wg_secret.clone(),
addr,
_rotation_task: super::conn::ConnKeyRotation::run(
self.chan.clone(),
Expand Down
39 changes: 39 additions & 0 deletions nat-lab/tests/test_pq.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import asyncio
import config
import pytest
from contextlib import AsyncExitStack
Expand Down Expand Up @@ -510,3 +511,41 @@ async def test_pq_vpn_upgrade_from_non_pq(

preshared = await read_preshared_key_slot(nlx_conn)
assert preshared != EMPTY_PRESHARED_KEY_SLOT


# Regression test for LLT-5884
@pytest.mark.timeout(240)
async def test_pq_vpn_handshake_after_nonet() -> None:
public_ip = "10.0.254.1"
async with AsyncExitStack() as exit_stack:
env = await exit_stack.enter_async_context(
setup_environment(
exit_stack,
[
SetupParameters(
connection_tag=ConnectionTag.DOCKER_CONE_CLIENT_1,
adapter_type_override=TelioAdapterType.NEP_TUN,
is_meshnet=False,
),
],
prepare_vpn=True,
)
)

client_conn, *_ = [conn.connection for conn in env.connections]
client_alpha, *_ = env.clients

ip = await stun.get(client_conn, config.STUN_SERVER)
assert ip == public_ip, f"wrong public IP before connecting to VPN {ip}"

await _connect_vpn_pq(
client_conn,
client_alpha,
)

async with client_alpha.get_router().break_udp_conn_to_host(
str(config.NLX_SERVER["ipv4"])
):
await asyncio.sleep(195)

await ping(client_conn, config.PHOTO_ALBUM_IP, timeout=10)
14 changes: 12 additions & 2 deletions src/device.rs
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ use telio_model::{
constants::{VPN_EXTERNAL_IPV4, VPN_INTERNAL_IPV4},
event::{Event, Set},
features::{FeaturePersistentKeepalive, Features, PathType},
mesh::{ExitNode, LinkState, Node},
mesh::{ExitNode, LinkState, Node, NodeState},
validation::validate_nickname,
EndpointMap,
};
Expand Down Expand Up @@ -2325,6 +2325,16 @@ impl Runtime {
}
}

fn should_supress_disconnected(&self, node: &Node) -> bool {
self.entities
.postquantum_wg
.peer_pubkey()
.map(|pubkey| {
node.public_key == pubkey && matches!(node.state, NodeState::Disconnected)
})
.unwrap_or(false)
}

fn remember_last_transmitted_node_event(&mut self, node: Node) {
if node.state == PeerState::Disconnected {
self.last_transmitted_event.remove(&node.public_key);
Expand Down Expand Up @@ -2389,7 +2399,7 @@ impl TaskRuntime for Runtime {

if let Some(node) = node {
// Publish WG event to app
if !self.is_dublicated_event(&node) {
if !self.is_dublicated_event(&node) && !self.should_supress_disconnected(&node) {
telio_log_debug!("Event is being published to libtelio integrators {node:?}");
let _ = self.event_publishers.libtelio_event_publisher.send(
Box::new(Event::Node {body: node.clone()})
Expand Down
38 changes: 38 additions & 0 deletions src/device/wg_controller.rs
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,8 @@ pub async fn consolidate_wg_state(
entities: &Entities,
features: &Features,
) -> Result {
maybe_restart_pq(entities).await;

let remote_peer_states = if let Some(meshnet_entities) = entities.meshnet.left() {
meshnet_entities.derp.get_remote_peer_states().await
} else {
Expand Down Expand Up @@ -147,6 +149,42 @@ pub async fn consolidate_wg_state(
Ok(())
}

// Postquantum has a quirk that can cause VPN connections to fail due to the client having a preshared key when the server doesn't
// This can happen if there is a handshake and a preshared key, then the client nonets for more than 180s (wg reject threshold),
// and then tries to handshake again
//
// The purpose of this function is to prevent this by restarting postquantum if the above case is reached. There are two conditions
// that need to be fulfilled for us to restart postquantum:
// 1. postquantum is active (here represented by checking if there is a peer public key)
// 2. the last handshake to the VPN server has passed the hardcoded wireguard reject threshold
// (here represented by comparing against `Some(None)`. The `time_since_last_handshake` will tick up to 180s and then be set to `None` so
// if `None` means that wireguard will reject the connection)
async fn maybe_restart_pq(entities: &Entities) {
let should_restart_pq = match entities.postquantum_wg.peer_pubkey() {
Some(pubkey) => {
let time_since_last_handshake = entities
.wireguard_interface
.get_interface()
.await
.ok()
.and_then(|ifc| {
ifc.peers.iter().find_map(|(_, peer)| {
if peer.public_key == pubkey {
Some(peer.time_since_last_handshake)
} else {
None
}
})
});
matches!(time_since_last_handshake, Some(None))
}
None => false,
};
if should_restart_pq {
entities.postquantum_wg.restart().await;
}
}

async fn consolidate_wg_private_key<W: WireGuard>(
requested_state: &RequestedState,
wireguard_interface: &W,
Expand Down

0 comments on commit d0e91bb

Please sign in to comment.