Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

LLT-5884: PQ VPN breaks after some time of nonet #1049

Merged
merged 1 commit into from
Jan 13, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .unreleased/LLT-5884
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Restart postquantum if the last handshake has passed the wireguard reject timeout
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions crates/telio-pq/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ telio-utils.workspace = true
neptun.workspace = true
pnet_packet.workspace = true
mockall = { workspace = true, optional = true }
parking_lot.workspace = true
rand.workspace = true
thiserror.workspace = true
tokio.workspace = true
Expand Down
92 changes: 59 additions & 33 deletions crates/telio-pq/src/entity.rs
Original file line number Diff line number Diff line change
@@ -1,12 +1,15 @@
use std::net::SocketAddr;
use std::sync::Arc;

use parking_lot::Mutex;

use telio_model::features::FeaturePostQuantumVPN;
use telio_task::io::chan;
use telio_utils::telio_log_debug;

struct Peer {
pubkey: telio_crypto::PublicKey,
wg_secret: telio_crypto::SecretKey,
addr: SocketAddr,
/// This is a key rotation task guard, its `Drop` implementation aborts the task
_rotation_task: super::conn::ConnKeyRotation,
Expand All @@ -17,16 +20,16 @@ pub struct Entity {
features: FeaturePostQuantumVPN,
sockets: Arc<telio_sockets::SocketPool>,
chan: chan::Tx<super::Event>,
peer: Option<Peer>,
peer: Mutex<Option<Peer>>,
mathiaspeters marked this conversation as resolved.
Show resolved Hide resolved
}

impl crate::PostQuantum for Entity {
fn keys(&self) -> Option<crate::Keys> {
self.peer.as_ref().and_then(|p| p.keys.clone())
self.peer.lock().as_ref().and_then(|p| p.keys.clone())
}

fn is_rotating_keys(&self) -> bool {
self.peer.is_some()
self.peer.lock().is_some()
}
}

Expand All @@ -40,44 +43,50 @@ impl Entity {
features,
sockets,
chan,
peer: None,
peer: Mutex::new(None),
}
}

pub fn on_event(&mut self, event: super::Event) {
let Some(peer) = &mut self.peer else {
return;
};

match event {
super::Event::Handshake(addr, keys) => {
if peer.addr == addr {
peer.keys = Some(keys);
pub fn on_event(&self, event: super::Event) {
if let Some(peer) = self.peer.lock().as_mut() {
mathiaspeters marked this conversation as resolved.
Show resolved Hide resolved
match event {
super::Event::Handshake(addr, keys) => {
if peer.addr == addr {
peer.keys = Some(keys);
}
}
}
super::Event::Rekey(super::Keys {
wg_secret,
pq_shared,
}) => {
if let Some(keys) = &mut peer.keys {
// Check if we are still talking to the same exit node
if keys.wg_secret == wg_secret {
// and only then update the preshared key,
// otherwise we're connecting to different node already
keys.pq_shared = pq_shared;
} else {
telio_log_debug!(
"PQ secret key does not match, ignoring shared secret rotation"
);
super::Event::Rekey(super::Keys {
wg_secret,
pq_shared,
}) => {
if let Some(keys) = &mut peer.keys {
// Check if we are still talking to the same exit node
if keys.wg_secret == wg_secret {
// and only then update the preshared key,
// otherwise we're connecting to different node already
keys.pq_shared = pq_shared;
} else {
telio_log_debug!(
"PQ secret key does not match, ignoring shared secret rotation"
);
}
}
}
_ => (),
}
_ => (),
}
}

pub async fn stop(&mut self) {
if let Some(peer) = self.peer.take() {
pub fn peer_pubkey(&self) -> Option<telio_crypto::PublicKey> {
self.peer.lock().as_ref().map(|p| p.pubkey)
}

// There is some temporary event filtering around disconnected events for postquantum
// That filtering depends heavily on the peer being removed before emitting Disconnected
// here
pub async fn stop(&self) {
let peer = self.peer.lock().take();
if let Some(peer) = peer {
#[allow(mpsc_blocking_send)]
let _ = self
.chan
Expand All @@ -87,15 +96,32 @@ impl Entity {
}

pub async fn start(
&mut self,
&self,
addr: SocketAddr,
wg_secret: telio_crypto::SecretKey,
peer: telio_crypto::PublicKey,
) {
self.stop().await;
self.start_impl(addr, wg_secret, peer).await;
}

self.peer = Some(Peer {
pub async fn restart(&self) {
let peer = self.peer.lock().take();
if let Some(peer) = peer {
self.start_impl(peer.addr, peer.wg_secret, peer.pubkey)
.await;
}
}

async fn start_impl(
&self,
addr: SocketAddr,
wg_secret: telio_crypto::SecretKey,
peer: telio_crypto::PublicKey,
) {
*self.peer.lock() = Some(Peer {
pubkey: peer,
wg_secret: wg_secret.clone(),
addr,
_rotation_task: super::conn::ConnKeyRotation::run(
self.chan.clone(),
Expand Down
51 changes: 45 additions & 6 deletions nat-lab/tests/test_pq.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import asyncio
import config
import pytest
from contextlib import AsyncExitStack
Expand Down Expand Up @@ -89,7 +90,7 @@ async def inspect_preshared_key(nlx_conn: Connection) -> str:
connection_tracker_config=generate_connection_tracker_config(
ConnectionTag.WINDOWS_VM_1,
stun_limits=(1, 1),
nlx_1_limits=(1, 2),
nlx_1_limits=(1, 3),
),
is_meshnet=False,
),
Expand All @@ -105,7 +106,7 @@ async def inspect_preshared_key(nlx_conn: Connection) -> str:
connection_tracker_config=generate_connection_tracker_config(
ConnectionTag.WINDOWS_VM_1,
stun_limits=(1, 1),
nlx_1_limits=(1, 2),
nlx_1_limits=(1, 3),
),
is_meshnet=False,
),
Expand Down Expand Up @@ -136,7 +137,7 @@ async def test_pq_vpn_connection(
) -> None:
async with AsyncExitStack() as exit_stack:
env = await exit_stack.enter_async_context(
setup_environment(exit_stack, [alpha_setup_params])
setup_environment(exit_stack, [alpha_setup_params], prepare_vpn=True)
mathiaspeters marked this conversation as resolved.
Show resolved Hide resolved
)

client_conn, *_ = [conn.connection for conn in env.connections]
Expand Down Expand Up @@ -188,7 +189,7 @@ async def test_pq_vpn_connection(
connection_tracker_config=generate_connection_tracker_config(
ConnectionTag.WINDOWS_VM_1,
stun_limits=(1, 1),
nlx_1_limits=(1, 2),
nlx_1_limits=(1, 3),
),
is_meshnet=False,
),
Expand All @@ -204,7 +205,7 @@ async def test_pq_vpn_connection(
connection_tracker_config=generate_connection_tracker_config(
ConnectionTag.WINDOWS_VM_1,
stun_limits=(1, 1),
nlx_1_limits=(1, 2),
nlx_1_limits=(1, 3),
),
is_meshnet=False,
),
Expand Down Expand Up @@ -238,7 +239,7 @@ async def test_pq_vpn_rekey(

async with AsyncExitStack() as exit_stack:
env = await exit_stack.enter_async_context(
setup_environment(exit_stack, [alpha_setup_params])
setup_environment(exit_stack, [alpha_setup_params], prepare_vpn=True)
mathiaspeters marked this conversation as resolved.
Show resolved Hide resolved
)

client_conn, *_ = [conn.connection for conn in env.connections]
Expand Down Expand Up @@ -510,3 +511,41 @@ async def test_pq_vpn_upgrade_from_non_pq(

preshared = await read_preshared_key_slot(nlx_conn)
assert preshared != EMPTY_PRESHARED_KEY_SLOT


# Regression test for LLT-5884
@pytest.mark.timeout(240)
async def test_pq_vpn_handshake_after_nonet() -> None:
public_ip = "10.0.254.1"
async with AsyncExitStack() as exit_stack:
env = await exit_stack.enter_async_context(
setup_environment(
exit_stack,
[
SetupParameters(
connection_tag=ConnectionTag.DOCKER_CONE_CLIENT_1,
adapter_type_override=TelioAdapterType.NEP_TUN,
is_meshnet=False,
),
],
prepare_vpn=True,
)
)

client_conn, *_ = [conn.connection for conn in env.connections]
client_alpha, *_ = env.clients

ip = await stun.get(client_conn, config.STUN_SERVER)
assert ip == public_ip, f"wrong public IP before connecting to VPN {ip}"

await _connect_vpn_pq(
client_conn,
client_alpha,
)

async with client_alpha.get_router().break_udp_conn_to_host(
str(config.NLX_SERVER["ipv4"])
):
await asyncio.sleep(195)
mathiaspeters marked this conversation as resolved.
Show resolved Hide resolved

await ping(client_conn, config.PHOTO_ALBUM_IP, timeout=10)
mathiaspeters marked this conversation as resolved.
Show resolved Hide resolved
14 changes: 12 additions & 2 deletions src/device.rs
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ use telio_model::{
constants::{VPN_EXTERNAL_IPV4, VPN_INTERNAL_IPV4},
event::{Event, Set},
features::{FeaturePersistentKeepalive, Features, PathType},
mesh::{ExitNode, LinkState, Node},
mesh::{ExitNode, LinkState, Node, NodeState},
validation::validate_nickname,
EndpointMap,
};
Expand Down Expand Up @@ -2325,6 +2325,16 @@ impl Runtime {
}
}

fn should_supress_disconnected(&self, node: &Node) -> bool {
self.entities
.postquantum_wg
.peer_pubkey()
.map(|pubkey| {
node.public_key == pubkey && matches!(node.state, NodeState::Disconnected)
})
.unwrap_or(false)
}

fn remember_last_transmitted_node_event(&mut self, node: Node) {
if node.state == PeerState::Disconnected {
self.last_transmitted_event.remove(&node.public_key);
Expand Down Expand Up @@ -2389,7 +2399,7 @@ impl TaskRuntime for Runtime {

if let Some(node) = node {
// Publish WG event to app
if !self.is_dublicated_event(&node) {
if !self.is_dublicated_event(&node) && !self.should_supress_disconnected(&node) {
telio_log_debug!("Event is being published to libtelio integrators {node:?}");
let _ = self.event_publishers.libtelio_event_publisher.send(
Box::new(Event::Node {body: node.clone()})
Expand Down
38 changes: 38 additions & 0 deletions src/device/wg_controller.rs
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,8 @@ pub async fn consolidate_wg_state(
entities: &Entities,
features: &Features,
) -> Result {
maybe_restart_pq(entities).await;

let remote_peer_states = if let Some(meshnet_entities) = entities.meshnet.left() {
meshnet_entities.derp.get_remote_peer_states().await
} else {
Expand Down Expand Up @@ -147,6 +149,42 @@ pub async fn consolidate_wg_state(
Ok(())
}

// Postquantum has a quirk that can cause VPN connections to fail due to the client having a preshared key when the server doesn't
// This can happen if there is a handshake and a preshared key, then the client nonets for more than 180s (wg reject threshold),
// and then tries to handshake again
//
// The purpose of this function is to prevent this by restarting postquantum if the above case is reached. There are two conditions
// that need to be fulfilled for us to restart postquantum:
// 1. postquantum is active (here represented by checking if there is a peer public key)
// 2. the last handshake to the VPN server has passed the hardcoded wireguard reject threshold
// (here represented by comparing against `Some(None)`. The `time_since_last_handshake` will tick up to 180s and then be set to `None` so
// if `None` means that wireguard will reject the connection)
async fn maybe_restart_pq(entities: &Entities) {
let should_restart_pq = match entities.postquantum_wg.peer_pubkey() {
Some(pubkey) => {
let time_since_last_handshake = entities
.wireguard_interface
.get_interface()
.await
.ok()
.and_then(|ifc| {
ifc.peers.iter().find_map(|(_, peer)| {
if peer.public_key == pubkey {
Some(peer.time_since_last_handshake)
} else {
None
}
})
});
matches!(time_since_last_handshake, Some(None))
}
None => false,
};
if should_restart_pq {
entities.postquantum_wg.restart().await;
}
}

async fn consolidate_wg_private_key<W: WireGuard>(
requested_state: &RequestedState,
wireguard_interface: &W,
Expand Down
Loading