Skip to content

Commit

Permalink
ext-ip-hack: fwd all ext traffic to phys gateway
Browse files Browse the repository at this point in the history
This modifies the hacked up NAT layer to always rewrite external
traffic to the local gateway on the "external" subnet. When the
destination is another IP on the subnet the gateway will just forward
it back onto the subnet. In the case of an off-subnet destination, the
gateway will forward it to the next hop. This allows both inbound and
outbound external traffic to work with the guest.

I also tried to get intra-guest (aka internal VPC traffic) to work
under the hack but it's fighting me and we want to land what's here
for the next demo.

I also modified the port process DTrace script to print more useful
info. And I added a start to a flowchart for processing flow.
  • Loading branch information
rzezeski committed Jul 7, 2022
1 parent 6bc29a9 commit 92a3ae2
Show file tree
Hide file tree
Showing 8 changed files with 132 additions and 38 deletions.
18 changes: 9 additions & 9 deletions dtrace/opte-port-process.d
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@
*/
#include "common.h"

#define HDR_FMT "%-12s %-3s %-8s %-43s %-18s %s\n"
#define LINE_FMT "%-12s %-3s %-8u %-43s 0x%-16p %s\n"
#define HDR_FMT "%-12s %-3s %-8s %-43s %-5s %s\n"
#define LINE_FMT "%-12s %-3s %-8u %-43s %-5u %s\n"

BEGIN {
/*
Expand All @@ -18,7 +18,7 @@ BEGIN {
protos[17] = "UDP";
protos[255] = "XXX";

printf(HDR_FMT, "NAME", "DIR", "EPOCH", "FLOW", "MBLK", "RESULT");
printf(HDR_FMT, "NAME", "DIR", "EPOCH", "FLOW", "LEN", "RESULT");
num = 0;
}

Expand All @@ -27,11 +27,11 @@ port-process-return {
this->name = stringof(arg1);
this->flow = (flow_id_sdt_arg_t *)arg2;
this->epoch = arg3;
this->mp = arg4;
this->mp = (mblk_t *)arg4;
this->res = stringof(arg5);

if (num >= 10) {
printf(HDR_FMT, "NAME", "DIR", "EPOCH", "FLOW", "MBLK",
printf(HDR_FMT, "NAME", "DIR", "EPOCH", "FLOW", "LEN",
"RESULT");
num = 0;
}
Expand All @@ -45,15 +45,15 @@ port-process-return {

port-process-return /this->af == AF_INET/ {
FLOW_FMT(this->s, this->flow);
printf(LINE_FMT, this->name, this->dir, this->epoch, this->s, this->mp,
this->res);
printf(LINE_FMT, this->name, this->dir, this->epoch, this->s,
msgsize(this->mp), this->res);
num++;
}

port-process-return /this->af == AF_INET6/ {
FLOW_FMT6(this->s, this->flow);
printf(LINE_FMT, this->name, this->dir, this->epoch, this->s, this->mp,
this->res);
printf(LINE_FMT, this->name, this->dir, this->epoch, this->s,
msgsize(this->mp), this->res);
num++;
}

Expand Down
30 changes: 30 additions & 0 deletions opte/process-flow.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
The size of the TCP flow table is currently 8096.

```mermaid
flowchart TD
process_in([process_in]) --> is_def_id{flow_id == FLOW_ID_DEFAULT?};
is_def_id -- Yes --> lp[layers_process];
is_def_id -- No --> check_uft{UFT entry?};
check_uft -- Yes --> same_epoch{entry.epoch == port.epoch?};
check_uft -- No --> lp;
same_epoch -- Yes --> run_ht[run HT];
same_epoch -- No --> inv[invalidate UFT entry];
inv --> lp;
run_ht --> is_tcp_uft{TCP?};
is_tcp_uft -- Yes --> pite[process_in_tcp_existing];
is_tcp_uft -- No --> rm([return Modified]);
lp --> lr{Layer Result?};
lr -- Allow --> uft_add[add UFT entry];
lr -- Deny --> rd([return Drop]);
lr -- "Hairpin(hp)" --> rhp(["return Hairpin(hp)"]);
lr -- "Err(e)" --> re(["return Err(e)"]);
pitn -- "Ok(TcpState::Closed)" --> rd;
pitn -- "Ok(tcp_state)" --> rm;
pitn -- "Err(e)" --> re;
pite -- "Ok(TcpState::Closed)" --> rd;
pite -- "Ok(tcp_state)" --> rm;
pite -- "Err(e)" --> re;
uft_add --> is_tcp_no_uft{TCP?};
is_tcp_no_uft -- Yes --> pitn[process_in_tcp_new];
is_tcp_no_uft -- No --> rm;
```
9 changes: 6 additions & 3 deletions opte/src/engine/int_test.rs
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,7 @@ fn lab_cfg() -> PortCfg {
snat: Some(SNatCfg {
public_ip: "76.76.21.21".parse().unwrap(),
ports: 1025..=4096,
phys_gw_mac: MacAddr::from([0x78, 0x23, 0xae, 0x5d, 0x4f, 0x0d]),
}),
gw_mac: MacAddr::from([0xAA, 0x00, 0x04, 0x00, 0xFF, 0x01]),
gw_ip: "172.20.14.1".parse().unwrap(),
Expand Down Expand Up @@ -179,8 +180,9 @@ fn g1_cfg() -> PortCfg {
// which the oxide Rack is simply a part of.
public_ip: "10.77.77.13".parse().unwrap(),
ports: 1025..=4096,
phys_gw_mac: MacAddr::from([0x78, 0x23, 0xae, 0x5d, 0x4f, 0x0d]),
}),
gw_mac: MacAddr::from([0xA8, 0x40, 0x25, 0xF7, 0x00, 0x1]),
gw_mac: MacAddr::from([0xA8, 0x40, 0x25, 0xFF, 0x77, 0x77]),
gw_ip: "192.168.77.1".parse().unwrap(),
vni: Vni::new(99u32).unwrap(),
// Site 0xF7, Rack 1, Sled 1, Interface 1
Expand All @@ -202,7 +204,7 @@ fn g1_cfg() -> PortCfg {
fn g2_cfg() -> PortCfg {
PortCfg {
private_ip: "192.168.77.102".parse().unwrap(),
private_mac: MacAddr::from([0xA8, 0x40, 0x25, 0xF7, 0x00, 0x66]),
private_mac: MacAddr::from([0xA8, 0x40, 0x25, 0xF0, 0x00, 0x66]),
vpc_subnet: "192.168.77.0/24".parse().unwrap(),
snat: Some(SNatCfg {
// NOTE: This is not a routable IP, but remember that a
Expand All @@ -211,8 +213,9 @@ fn g2_cfg() -> PortCfg {
// which the oxide Rack is simply a part of.
public_ip: "10.77.77.23".parse().unwrap(),
ports: 4097..=8192,
phys_gw_mac: MacAddr::from([0x78, 0x23, 0xae, 0x5d, 0x4f, 0x0d]),
}),
gw_mac: MacAddr::from([0xA8, 0x40, 0x25, 0xF7, 0x00, 0x1]),
gw_mac: MacAddr::from([0xA8, 0x40, 0x25, 0xFF, 0x77, 0x77]),
gw_ip: "192.168.77.1".parse().unwrap(),
vni: Vni::new(99u32).unwrap(),
// Site 0xF7, Rack 1, Sled 22, Interface 1
Expand Down
44 changes: 26 additions & 18 deletions opte/src/engine/nat.rs
Original file line number Diff line number Diff line change
Expand Up @@ -31,11 +31,20 @@ use crate::api::{Direction, Ipv4Addr, MacAddr};
pub struct Nat4 {
priv_ip: Ipv4Addr,
public_ip: Ipv4Addr,
phys_gw_mac: MacAddr,
}

impl Nat4 {
pub fn new(priv_ip: Ipv4Addr, public_ip: Ipv4Addr) -> Self {
Self { priv_ip: priv_ip.into(), public_ip: public_ip.into() }
pub fn new(
priv_ip: Ipv4Addr,
public_ip: Ipv4Addr,
phys_gw_mac: MacAddr,
) -> Self {
Self {
priv_ip: priv_ip.into(),
public_ip: public_ip.into(),
phys_gw_mac,
}
}
}

Expand All @@ -51,15 +60,14 @@ impl StatefulAction for Nat4 {
_flow_id: &InnerFlowId,
meta: &mut Meta,
) -> rule::GenDescResult {
let mac_addr = meta.get::<MacAddr>();
let desc = Nat4Desc {
priv_ip: self.priv_ip,
public_ip: self.public_ip,
// XXX-EXT-IP This is assuming ext_ip_hack and will only
// allow for inbound connections, this will not work for
// outbound. If we want that we'll want to actually query
// the native router/ARP table.
src_mac: mac_addr.cloned(),
// XXX-EXT-IP This is assuming ext_ip_hack. All packets
// outbound for IG will have their dest mac rewritten to
// go to physical gateway, which will then properly route
// the destination IP.
phys_gw_mac: self.phys_gw_mac.clone(),
};
Ok(AllowOrDeny::Allow(Arc::new(desc)))
}
Expand All @@ -77,7 +85,7 @@ pub struct Nat4Desc {
priv_ip: Ipv4Addr,
public_ip: Ipv4Addr,
// XXX-EXT-IP
src_mac: Option<MacAddr>,
phys_gw_mac: MacAddr,
}

pub const NAT4_NAME: &'static str = "NAT4";
Expand All @@ -97,11 +105,10 @@ impl ActionDesc for Nat4Desc {
};

// XXX-EXT-IP hack to rewrite destination MAC address
// from virtual gateway addr to actual address that
// initiated connection.
if self.src_mac.is_some() {
ht.inner_ether = EtherMeta::modify(None, self.src_mac);
}
// from virtual gateway addr to the real gateway addr
// on the same subnet as the external IP.
ht.inner_ether =
EtherMeta::modify(None, Some(self.phys_gw_mac));
ht
}

Expand Down Expand Up @@ -131,14 +138,15 @@ mod test {
use crate::engine::packet::{MetaGroup, PacketMeta};
use crate::engine::tcp::TcpMeta;

let priv_mac = MacAddr::from([0x02, 0x08, 0x20, 0xd8, 0x35, 0xcf]);
let dest_mac = MacAddr::from([0x78, 0x23, 0xae, 0x5d, 0x4f, 0x0d]);
let priv_mac = MacAddr::from([0xA8, 0x40, 0x25, 0xF0, 0x00, 0x01]);
let dest_mac = MacAddr::from([0xA8, 0x40, 0x25, 0xFF, 0x77, 0x77]);
let priv_ip = "10.0.0.220".parse().unwrap();
let priv_port = "4999".parse().unwrap();
let pub_ip = "52.10.128.69".parse().unwrap();
let outside_ip = "76.76.21.21".parse().unwrap();
let outside_port = 80;
let nat = Nat4::new(priv_ip, pub_ip);
let gw_mac = MacAddr::from([0x78, 0x23, 0xae, 0x5d, 0x4f, 0x0d]);
let nat = Nat4::new(priv_ip, pub_ip, gw_mac);
let mut port_meta = Meta::new();

// ================================================================
Expand Down Expand Up @@ -189,7 +197,7 @@ mod test {

let ether_meta = pmo.inner.ether.as_ref().unwrap();
assert_eq!(ether_meta.src, priv_mac);
assert_eq!(ether_meta.dst, dest_mac);
assert_eq!(ether_meta.dst, gw_mac);

let ip4_meta = match pmo.inner.ip.as_ref().unwrap() {
IpMeta::Ip4(v) => v,
Expand Down
1 change: 1 addition & 0 deletions opte/src/oxide_vpc/api.rs
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,7 @@ pub struct CreateXdeReq {
pub struct SNatCfg {
pub public_ip: Ipv4Addr,
pub ports: core::ops::RangeInclusive<u16>,
pub phys_gw_mac: MacAddr,
}

/// Xde delete ioctl parameter data.
Expand Down
6 changes: 5 additions & 1 deletion opte/src/oxide_vpc/engine/nat4.rs
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,11 @@ pub fn setup(
// XXX-EXT-IP This config should not come from SNAT. This is
// currently a hack assuming its use is in service of the
// ext_ip_hack flag.
let nat = Nat4::new(cfg.private_ip, cfg.snat.as_ref().unwrap().public_ip);
let nat = Nat4::new(
cfg.private_ip,
cfg.snat.as_ref().unwrap().public_ip,
cfg.snat.as_ref().unwrap().phys_gw_mac,
);
let layer = Layer::new(
NAT4_LAYER_NAME,
pb.name(),
Expand Down
5 changes: 5 additions & 0 deletions opteadm/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,9 @@ enum Command {
#[structopt(long)]
snat_end: Option<u16>,

#[structopt(long)]
snat_gw_mac: Option<MacAddr>,

#[structopt(long)]
passthrough: bool,
},
Expand Down Expand Up @@ -498,6 +501,7 @@ fn main() {
snat_ip,
snat_start,
snat_end,
snat_gw_mac,
passthrough,
} => {
let hdl = opteadm::OpteAdm::open(OpteAdm::DLD_CTL).unwrap_or_die();
Expand All @@ -508,6 +512,7 @@ fn main() {
snat_start.unwrap(),
snat_end.unwrap(),
),
phys_gw_mac: snat_gw_mac.unwrap(),
}),

None => None,
Expand Down
57 changes: 50 additions & 7 deletions xde/src/xde.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1146,7 +1146,33 @@ fn guest_loopback(
let devs = unsafe { xde_devs.read() };
let ether_dst = pkt.headers().inner.ether.dst();

match devs.iter().find(|x| x.vni == vni && x.port.mac_addr() == ether_dst) {
let maybe_dest_dev = if unsafe { xde_ext_ip_hack == 1 } {
let ip = pkt.headers().inner.ip.as_ref();
match ip {
None => None,
Some(ip_hdr) => {
// XXX Doing all these shenanigans because we don't
// have the overlay layer in place which would
// normally rewrite the dst MAC addr to that of the
// dest guest.
//
// XXX Sigh, this still needs more work because we
// really do need to rewrite the dst mac, but I'm in a
// rush trying to get things working for a demo and I
// have too many yaks in my office. Come back to this
// later.
if let Some(ip4) = ip_hdr.ip4() {
devs.iter().find(|x| x.port_cfg.private_ip == ip4.dst())
} else {
None
}
}
}
} else {
devs.iter().find(|x| x.vni == vni && x.port.mac_addr() == ether_dst)
};

match maybe_dest_dev {
Some(dest_dev) => {
// We have found a matching Port on this host; "loop back"
// the packet into the inbound processing path of the
Expand Down Expand Up @@ -1298,12 +1324,29 @@ unsafe extern "C" fn xde_mc_tx(
match res {
Ok(ProcessResult::Modified) => {
if xde_ext_ip_hack == 1 {
opte::engine::dbg(format!("[Tx] ext_ip_hack, bypass encap"));
// TODO need to special-case guest-loopback here as
// well if we want intra-guest comms to work when the
// ext_ip_hack is enabled.
mch.tx_drop_on_no_desc(pkt, hint, MacTxFlags::empty());
return ptr::null_mut();
use opte::oxide_vpc::engine::router::RouterTargetInternal;
// XXX This entry should always exist, but if it
// doesn't we fallback to the IG.
let tgt = meta
.get::<RouterTargetInternal>()
.unwrap_or(&RouterTargetInternal::InternetGateway);
opte::engine::dbg(format!("[Tx] ext_ip_hack: {:?}", tgt));
match tgt {
RouterTargetInternal::InternetGateway => {
mch.tx_drop_on_no_desc(pkt, hint, MacTxFlags::empty());
return ptr::null_mut();
}

// XXX This assumes VpcSubnet, an IP router target
// will not work without our encap network.
_ => {
return guest_loopback(
src_dev,
pkt,
Vni::new(7777u32).unwrap(),
);
}
}
}

// If the outer IPv6 destination is the same as the
Expand Down

0 comments on commit 92a3ae2

Please sign in to comment.