diff --git a/dtrace/opte-port-process.d b/dtrace/opte-port-process.d index ebdd3688..b2b06f0f 100644 --- a/dtrace/opte-port-process.d +++ b/dtrace/opte-port-process.d @@ -5,8 +5,8 @@ */ #include "common.h" -#define HDR_FMT "%-12s %-3s %-8s %-43s %-18s %s\n" -#define LINE_FMT "%-12s %-3s %-8u %-43s 0x%-16p %s\n" +#define HDR_FMT "%-12s %-3s %-8s %-43s %-5s %s\n" +#define LINE_FMT "%-12s %-3s %-8u %-43s %-5u %s\n" BEGIN { /* @@ -18,7 +18,7 @@ BEGIN { protos[17] = "UDP"; protos[255] = "XXX"; - printf(HDR_FMT, "NAME", "DIR", "EPOCH", "FLOW", "MBLK", "RESULT"); + printf(HDR_FMT, "NAME", "DIR", "EPOCH", "FLOW", "LEN", "RESULT"); num = 0; } @@ -27,11 +27,11 @@ port-process-return { this->name = stringof(arg1); this->flow = (flow_id_sdt_arg_t *)arg2; this->epoch = arg3; - this->mp = arg4; + this->mp = (mblk_t *)arg4; this->res = stringof(arg5); if (num >= 10) { - printf(HDR_FMT, "NAME", "DIR", "EPOCH", "FLOW", "MBLK", + printf(HDR_FMT, "NAME", "DIR", "EPOCH", "FLOW", "LEN", "RESULT"); num = 0; } @@ -45,15 +45,15 @@ port-process-return { port-process-return /this->af == AF_INET/ { FLOW_FMT(this->s, this->flow); - printf(LINE_FMT, this->name, this->dir, this->epoch, this->s, this->mp, - this->res); + printf(LINE_FMT, this->name, this->dir, this->epoch, this->s, + msgsize(this->mp), this->res); num++; } port-process-return /this->af == AF_INET6/ { FLOW_FMT6(this->s, this->flow); - printf(LINE_FMT, this->name, this->dir, this->epoch, this->s, this->mp, - this->res); + printf(LINE_FMT, this->name, this->dir, this->epoch, this->s, + msgsize(this->mp), this->res); num++; } diff --git a/opte/process-flow.md b/opte/process-flow.md new file mode 100644 index 00000000..54632532 --- /dev/null +++ b/opte/process-flow.md @@ -0,0 +1,30 @@ +The size of the TCP flow table is currently 8096. 
+ +```mermaid +flowchart TD + process_in([process_in]) --> is_def_id{flow_id == FLOW_ID_DEFAULT?}; + is_def_id -- Yes --> lp[layers_process]; + is_def_id -- No --> check_uft{UFT entry?}; + check_uft -- Yes --> same_epoch{entry.epoch == port.epoch?}; + check_uft -- No --> lp; + same_epoch -- Yes --> run_ht[run HT]; + same_epoch -- No --> inv[invalidate UFT entry]; + inv --> lp; + run_ht --> is_tcp_uft{TCP?}; + is_tcp_uft -- Yes --> pite[process_in_tcp_existing]; + is_tcp_uft -- No --> rm([return Modified]); + lp --> lr{Layer Result?}; + lr -- Allow --> uft_add[add UFT entry]; + lr -- Deny --> rd([return Drop]); + lr -- "Hairpin(hp)" --> rhp(["return Hairpin(hp)"]); + lr -- "Err(e)" --> re(["return Err(e)"]); + pitn -- "Ok(TcpState::Closed)" --> rd; + pitn -- "Ok(tcp_state)" --> rm; + pitn -- "Err(e)" --> re; + pite -- "Ok(TcpState::Closed)" --> rd; + pite -- "Ok(tcp_state)" --> rm; + pite -- "Err(e)" --> re; + uft_add --> is_tcp_no_uft{TCP?}; + is_tcp_no_uft -- Yes --> pitn[process_in_tcp_new]; + is_tcp_no_uft -- No --> rm; +``` diff --git a/opte/src/engine/int_test.rs b/opte/src/engine/int_test.rs index 1c160bb4..9b1439c6 100644 --- a/opte/src/engine/int_test.rs +++ b/opte/src/engine/int_test.rs @@ -107,6 +107,7 @@ fn lab_cfg() -> PortCfg { snat: Some(SNatCfg { public_ip: "76.76.21.21".parse().unwrap(), ports: 1025..=4096, + phys_gw_mac: MacAddr::from([0x78, 0x23, 0xae, 0x5d, 0x4f, 0x0d]), }), gw_mac: MacAddr::from([0xAA, 0x00, 0x04, 0x00, 0xFF, 0x01]), gw_ip: "172.20.14.1".parse().unwrap(), @@ -179,8 +180,9 @@ fn g1_cfg() -> PortCfg { // which the oxide Rack is simply a part of. 
public_ip: "10.77.77.13".parse().unwrap(), ports: 1025..=4096, + phys_gw_mac: MacAddr::from([0x78, 0x23, 0xae, 0x5d, 0x4f, 0x0d]), }), - gw_mac: MacAddr::from([0xA8, 0x40, 0x25, 0xF7, 0x00, 0x1]), + gw_mac: MacAddr::from([0xA8, 0x40, 0x25, 0xFF, 0x77, 0x77]), gw_ip: "192.168.77.1".parse().unwrap(), vni: Vni::new(99u32).unwrap(), // Site 0xF7, Rack 1, Sled 1, Interface 1 @@ -202,7 +204,7 @@ fn g1_cfg() -> PortCfg { fn g2_cfg() -> PortCfg { PortCfg { private_ip: "192.168.77.102".parse().unwrap(), - private_mac: MacAddr::from([0xA8, 0x40, 0x25, 0xF7, 0x00, 0x66]), + private_mac: MacAddr::from([0xA8, 0x40, 0x25, 0xF0, 0x00, 0x66]), vpc_subnet: "192.168.77.0/24".parse().unwrap(), snat: Some(SNatCfg { // NOTE: This is not a routable IP, but remember that a @@ -211,8 +213,9 @@ fn g2_cfg() -> PortCfg { // which the oxide Rack is simply a part of. public_ip: "10.77.77.23".parse().unwrap(), ports: 4097..=8192, + phys_gw_mac: MacAddr::from([0x78, 0x23, 0xae, 0x5d, 0x4f, 0x0d]), }), - gw_mac: MacAddr::from([0xA8, 0x40, 0x25, 0xF7, 0x00, 0x1]), + gw_mac: MacAddr::from([0xA8, 0x40, 0x25, 0xFF, 0x77, 0x77]), gw_ip: "192.168.77.1".parse().unwrap(), vni: Vni::new(99u32).unwrap(), // Site 0xF7, Rack 1, Sled 22, Interface 1 diff --git a/opte/src/engine/nat.rs b/opte/src/engine/nat.rs index 5717be59..372c8f88 100644 --- a/opte/src/engine/nat.rs +++ b/opte/src/engine/nat.rs @@ -31,11 +31,20 @@ use crate::api::{Direction, Ipv4Addr, MacAddr}; pub struct Nat4 { priv_ip: Ipv4Addr, public_ip: Ipv4Addr, + phys_gw_mac: MacAddr, } impl Nat4 { - pub fn new(priv_ip: Ipv4Addr, public_ip: Ipv4Addr) -> Self { - Self { priv_ip: priv_ip.into(), public_ip: public_ip.into() } + pub fn new( + priv_ip: Ipv4Addr, + public_ip: Ipv4Addr, + phys_gw_mac: MacAddr, + ) -> Self { + Self { + priv_ip: priv_ip.into(), + public_ip: public_ip.into(), + phys_gw_mac, + } } } @@ -51,15 +60,14 @@ impl StatefulAction for Nat4 { _flow_id: &InnerFlowId, meta: &mut Meta, ) -> rule::GenDescResult { - let mac_addr = 
meta.get::(); let desc = Nat4Desc { priv_ip: self.priv_ip, public_ip: self.public_ip, - // XXX-EXT-IP This is assuming ext_ip_hack and will only - // allow for inbound connections, this will not work for - // outbound. If we want that we'll want to actually query - // the native router/ARP table. - src_mac: mac_addr.cloned(), + // XXX-EXT-IP This is assuming ext_ip_hack. All packets + // outbound for IG will have their dest mac rewritten to + // go to physical gateway, which will then properly route + // the destination IP. + phys_gw_mac: self.phys_gw_mac.clone(), }; Ok(AllowOrDeny::Allow(Arc::new(desc))) } @@ -77,7 +85,7 @@ pub struct Nat4Desc { priv_ip: Ipv4Addr, public_ip: Ipv4Addr, // XXX-EXT-IP - src_mac: Option, + phys_gw_mac: MacAddr, } pub const NAT4_NAME: &'static str = "NAT4"; @@ -97,11 +105,10 @@ impl ActionDesc for Nat4Desc { }; // XXX-EXT-IP hack to rewrite destination MAC adress - // from virtual gateway addr to actual address that - // initiated connection. - if self.src_mac.is_some() { - ht.inner_ether = EtherMeta::modify(None, self.src_mac); - } + // from virtual gateway addr to the real gateway addr + // on the same subnet as the external IP. 
+ ht.inner_ether = + EtherMeta::modify(None, Some(self.phys_gw_mac)); ht } @@ -131,14 +138,15 @@ mod test { use crate::engine::packet::{MetaGroup, PacketMeta}; use crate::engine::tcp::TcpMeta; - let priv_mac = MacAddr::from([0x02, 0x08, 0x20, 0xd8, 0x35, 0xcf]); - let dest_mac = MacAddr::from([0x78, 0x23, 0xae, 0x5d, 0x4f, 0x0d]); + let priv_mac = MacAddr::from([0xA8, 0x40, 0x25, 0xF0, 0x00, 0x01]); + let dest_mac = MacAddr::from([0xA8, 0x40, 0x25, 0xFF, 0x77, 0x77]); let priv_ip = "10.0.0.220".parse().unwrap(); let priv_port = "4999".parse().unwrap(); let pub_ip = "52.10.128.69".parse().unwrap(); let outside_ip = "76.76.21.21".parse().unwrap(); let outside_port = 80; - let nat = Nat4::new(priv_ip, pub_ip); + let gw_mac = MacAddr::from([0x78, 0x23, 0xae, 0x5d, 0x4f, 0x0d]); + let nat = Nat4::new(priv_ip, pub_ip, gw_mac); let mut port_meta = Meta::new(); // ================================================================ @@ -189,7 +197,7 @@ mod test { let ether_meta = pmo.inner.ether.as_ref().unwrap(); assert_eq!(ether_meta.src, priv_mac); - assert_eq!(ether_meta.dst, dest_mac); + assert_eq!(ether_meta.dst, gw_mac); let ip4_meta = match pmo.inner.ip.as_ref().unwrap() { IpMeta::Ip4(v) => v, diff --git a/opte/src/oxide_vpc/api.rs b/opte/src/oxide_vpc/api.rs index 2ff9f376..2ad750a1 100644 --- a/opte/src/oxide_vpc/api.rs +++ b/opte/src/oxide_vpc/api.rs @@ -124,6 +124,7 @@ pub struct CreateXdeReq { pub struct SNatCfg { pub public_ip: Ipv4Addr, pub ports: core::ops::RangeInclusive, + pub phys_gw_mac: MacAddr, } /// Xde delete ioctl parameter data. diff --git a/opte/src/oxide_vpc/engine/nat4.rs b/opte/src/oxide_vpc/engine/nat4.rs index 0a816d1d..905bb5f3 100644 --- a/opte/src/oxide_vpc/engine/nat4.rs +++ b/opte/src/oxide_vpc/engine/nat4.rs @@ -35,7 +35,11 @@ pub fn setup( // XXX-EXT-IP This config should not some from SNAT. This is // currently a hack assuming its use is in service of the // ext_ip_hack flag. 
- let nat = Nat4::new(cfg.private_ip, cfg.snat.as_ref().unwrap().public_ip); + let nat = Nat4::new( + cfg.private_ip, + cfg.snat.as_ref().unwrap().public_ip, + cfg.snat.as_ref().unwrap().phys_gw_mac, + ); let layer = Layer::new( NAT4_LAYER_NAME, pb.name(), diff --git a/opteadm/src/main.rs b/opteadm/src/main.rs index 70848042..ea81ba87 100644 --- a/opteadm/src/main.rs +++ b/opteadm/src/main.rs @@ -143,6 +143,9 @@ enum Command { #[structopt(long)] snat_end: Option, + #[structopt(long)] + snat_gw_mac: Option, + #[structopt(long)] passthrough: bool, }, @@ -498,6 +501,7 @@ fn main() { snat_ip, snat_start, snat_end, + snat_gw_mac, passthrough, } => { let hdl = opteadm::OpteAdm::open(OpteAdm::DLD_CTL).unwrap_or_die(); @@ -508,6 +512,7 @@ fn main() { snat_start.unwrap(), snat_end.unwrap(), ), + phys_gw_mac: snat_gw_mac.unwrap(), }), None => None, diff --git a/xde/src/xde.rs b/xde/src/xde.rs index 85d1883d..a370be75 100644 --- a/xde/src/xde.rs +++ b/xde/src/xde.rs @@ -1146,7 +1146,33 @@ fn guest_loopback( let devs = unsafe { xde_devs.read() }; let ether_dst = pkt.headers().inner.ether.dst(); - match devs.iter().find(|x| x.vni == vni && x.port.mac_addr() == ether_dst) { + let maybe_dest_dev = if unsafe { xde_ext_ip_hack == 1 } { + let ip = pkt.headers().inner.ip.as_ref(); + match ip { + None => None, + Some(ip_hdr) => { + // XXX Doing all these shenanigans because we don't + // have the overlay layer in place which would + // normally rewrite the dst MAC addr to that of the + // dest guest. + // + // XXX Sigh, this still needs more work because we + // really do need to rewrite the dst mac, but I'm in a + // rush trying to get things working for a demo and I + // have too many yaks in my office. Come back to this + // later. 
+ if let Some(ip4) = ip_hdr.ip4() { + devs.iter().find(|x| x.port_cfg.private_ip == ip4.dst()) + } else { + None + } + } + } + } else { + devs.iter().find(|x| x.vni == vni && x.port.mac_addr() == ether_dst) + }; + + match maybe_dest_dev { Some(dest_dev) => { // We have found a matching Port on this host; "loop back" // the packet into the inbound processing path of the @@ -1298,12 +1324,29 @@ unsafe extern "C" fn xde_mc_tx( match res { Ok(ProcessResult::Modified) => { if xde_ext_ip_hack == 1 { - opte::engine::dbg(format!("[Tx] ext_ip_hack, bypass encap")); - // TODO need to special-case guest-loopback here as - // well if we want intra-guest comms to work when the - // ext_ip_hack is enabled. - mch.tx_drop_on_no_desc(pkt, hint, MacTxFlags::empty()); - return ptr::null_mut(); + use opte::oxide_vpc::engine::router::RouterTargetInternal; + // XXX This entry should always exist, but if it + // doesn't we fallback to the IG. + let tgt = meta + .get::() + .unwrap_or(&RouterTargetInternal::InternetGateway); + opte::engine::dbg(format!("[Tx] ext_ip_hack: {:?}", tgt)); + match tgt { + RouterTargetInternal::InternetGateway => { + mch.tx_drop_on_no_desc(pkt, hint, MacTxFlags::empty()); + return ptr::null_mut(); + } + + // XXX This assumes VpcSubnet, an IP router target + // will not work without our encap network. + _ => { + return guest_loopback( + src_dev, + pkt, + Vni::new(7777u32).unwrap(), + ); + } + } } // If the outer IPv6 destination is the same as the