From 6f1db5ee1fa46f867893124d6cd41604a0c561d7 Mon Sep 17 00:00:00 2001 From: /gray Date: Fri, 1 Mar 2024 18:27:02 +0800 Subject: [PATCH] feat(bpf): implement stack bypass (#458) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Sumire (菫) <151038614+sumire88@users.noreply.github.com> --- .github/workflows/kernel-test.yml | 108 +++++++++-- cmd/run.go | 9 +- config/config.go | 9 +- control/anyfrom_pool.go | 7 +- control/bpf_utils.go | 11 ++ control/control_plane.go | 32 ++-- control/control_plane_core.go | 251 +++++++------------------ control/kern/tproxy.c | 294 ++++++++++++++---------------- control/netns_utils.go | 225 +++++++++++++++-------- control/udp.go | 20 +- docs/en/README.md | 1 - docs/en/how-it-works.md | 2 + docs/zh/README.md | 1 - example.dae | 5 - go.mod | 4 +- go.sum | 5 - 16 files changed, 476 insertions(+), 508 deletions(-) diff --git a/.github/workflows/kernel-test.yml b/.github/workflows/kernel-test.yml index ec4562e6b..93007fb9d 100644 --- a/.github/workflows/kernel-test.yml +++ b/.github/workflows/kernel-test.yml @@ -40,7 +40,7 @@ jobs: strategy: fail-fast: false matrix: - kernel: [ '5.10-v0.3', '5.15-v0.3', '6.3-main', 'bpf-next-20231030.012704' ] + kernel: [ '5.10-20240201.165956', '5.15-20240201.165956', '6.1-20240201.165956', 'bpf-next-20240204.012837' ] timeout-minutes: 10 steps: - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 @@ -52,7 +52,7 @@ jobs: path: dae - name: Provision LVH VMs - uses: cilium/little-vm-helper@908ab1ff8a596a03cd5221a1f8602dc44c3f906d # v0.0.12 + uses: cilium/little-vm-helper@9d758b756305e83718a51b792a5aeabd022a39ec # v0.0.16 with: test-name: dae-test image-version: ${{ matrix.kernel }} @@ -66,7 +66,7 @@ jobs: apt install -y unzip - name: Setup network - uses: cilium/little-vm-helper@908ab1ff8a596a03cd5221a1f8602dc44c3f906d # v0.0.12 + uses: cilium/little-vm-helper@9d758b756305e83718a51b792a5aeabd022a39ec # v0.0.16 with: provision: 'false' cmd: | @@ -77,7 +77,7 @@ jobs: docker run -td --name dae --privileged --network dae -v /host:/host -v /sys:/sys ubuntu:22.04 bash - name: Setup v2ray server - uses: cilium/little-vm-helper@908ab1ff8a596a03cd5221a1f8602dc44c3f906d # v0.0.12 + uses: cilium/little-vm-helper@9d758b756305e83718a51b792a5aeabd022a39ec # v0.0.16 with: provision: 'false' cmd: | @@ -125,7 +125,7 @@ jobs: echo '{"v":"2","ps":"test","add":"v2ray","port":"23333","id":"b004539e-0d7b-7996-c378-fb040e42de70","aid":"0","net":"tcp","tls":"","type":"none","path":"","host":"v2ray"}' > vmess.json - name: Setup dae server - uses: cilium/little-vm-helper@908ab1ff8a596a03cd5221a1f8602dc44c3f906d # v0.0.12 + uses: cilium/little-vm-helper@9d758b756305e83718a51b792a5aeabd022a39ec # v0.0.16 with: provision: 'false' cmd: | @@ -165,11 +165,11 @@ jobs: chmod 600 ./conf.dae nohup docker exec dae /host/dae/dae run -c /host/conf.dae &> dae.log & - sleep 10s + sleep 5s cat dae.log - name: Check WAN IPv4 TCP - uses: cilium/little-vm-helper@908ab1ff8a596a03cd5221a1f8602dc44c3f906d # v0.0.12 + uses: cilium/little-vm-helper@9d758b756305e83718a51b792a5aeabd022a39ec # v0.0.16 with: provision: 'false' cmd: | @@ -180,7 +180,7 @@ jobs: cat /host/v2ray.access.log | grep -q 'accepted tcp:1.1.1.1:443' - name: Check WAN IPv4 UDP - uses: cilium/little-vm-helper@908ab1ff8a596a03cd5221a1f8602dc44c3f906d # v0.0.12 + uses: cilium/little-vm-helper@9d758b756305e83718a51b792a5aeabd022a39ec # v0.0.16 with: provision: 'false' cmd: | @@ -191,7 +191,7 @@ jobs: cat /host/v2ray.access.log | grep -q 'accepted udp:1.1.1.1:53' - name: Check WAN IPv6 TCP - uses: cilium/little-vm-helper@908ab1ff8a596a03cd5221a1f8602dc44c3f906d # v0.0.12 + uses: cilium/little-vm-helper@9d758b756305e83718a51b792a5aeabd022a39ec # v0.0.16 with: provision: 'false' cmd: | @@ -202,7 +202,41 @@ jobs: cat /host/dae.log | grep -F -- '-> [2606:4700:4700::1111]:443' - name: Check WAN IPv6 UDP - uses: cilium/little-vm-helper@908ab1ff8a596a03cd5221a1f8602dc44c3f906d # v0.0.12 + uses: cilium/little-vm-helper@9d758b756305e83718a51b792a5aeabd022a39ec # v0.0.16 + with: + provision: 'false' + cmd: | + set -ex + + docker exec dae dig @2606:4700:4700::1111 one.one.one.one + cat /host/dae.log | grep -F -- '-> [2606:4700:4700::1111]:53' + + - name: Setup WAN UDP port conflict + uses: cilium/little-vm-helper@9d758b756305e83718a51b792a5aeabd022a39ec # v0.0.16 + with: + provision: 'false' + cmd: | + set -ex + + docker restart -t0 dae v2ray + nohup docker exec v2ray v2ray -c /host/v2ray.json &> v2ray.log & + nohup docker exec dae /host/dae/dae run -c /host/conf.dae &> dae.log & + sleep 5s + nohup docker exec dae nc -lu 53 &> nc.log & + + - name: Check WAN IPv4 UDP with port conflict + uses: cilium/little-vm-helper@9d758b756305e83718a51b792a5aeabd022a39ec # v0.0.16 + with: + provision: 'false' + cmd: | + set -ex + + docker exec dae dig @1.1.1.1 one.one.one.one + cat /host/dae.log | grep -F -- '-> 1.1.1.1:53' + cat /host/v2ray.access.log | grep -q 'accepted udp:1.1.1.1:53' + + - name: Check WAN IPv6 UDP with port conflict + uses: cilium/little-vm-helper@9d758b756305e83718a51b792a5aeabd022a39ec # v0.0.16 with: provision: 'false' cmd: | @@ -212,7 +246,7 @@ jobs: cat /host/dae.log | grep -F -- '-> [2606:4700:4700::1111]:53' - name: Setup LAN - uses: cilium/little-vm-helper@908ab1ff8a596a03cd5221a1f8602dc44c3f906d # v0.0.12 + uses: cilium/little-vm-helper@9d758b756305e83718a51b792a5aeabd022a39ec # v0.0.16 with: provision: 'false' cmd: | @@ -261,7 +295,7 @@ jobs: log_level: trace lan_interface: dae-veth-peer - wan_interface: auto + wan_interface: dae-veth-peer,eth0 allow_insecure: false auto_config_kernel_parameter: true } @@ -284,11 +318,11 @@ jobs: chmod 600 ./conf.dae nohup docker exec dae /host/dae/dae run -c /host/conf.dae &> dae.log & - sleep 10s + sleep 5s cat dae.log - name: Check LAN IPv4 TCP - uses: cilium/little-vm-helper@908ab1ff8a596a03cd5221a1f8602dc44c3f906d # v0.0.12 + uses: cilium/little-vm-helper@9d758b756305e83718a51b792a5aeabd022a39ec # v0.0.16 with: provision: 'false' cmd: | @@ -299,7 +333,7 @@ jobs: cat /host/v2ray.access.log | grep -q 'accepted tcp:1.0.0.1:80' - name: Check LAN IPv4 UDP - uses: cilium/little-vm-helper@908ab1ff8a596a03cd5221a1f8602dc44c3f906d # v0.0.12 + uses: cilium/little-vm-helper@9d758b756305e83718a51b792a5aeabd022a39ec # v0.0.16 with: provision: 'false' cmd: | @@ -310,7 +344,7 @@ jobs: cat /host/v2ray.access.log | grep -q 'accepted udp:8.8.4.4:53' - name: Check LAN IPv6 TCP - uses: cilium/little-vm-helper@908ab1ff8a596a03cd5221a1f8602dc44c3f906d # v0.0.12 + uses: cilium/little-vm-helper@9d758b756305e83718a51b792a5aeabd022a39ec # v0.0.16 with: provision: 'false' cmd: | @@ -321,7 +355,47 @@ jobs: cat /host/dae.log | grep -F -- '-> [2606:4700:4700::1001]:80' - name: Check LAN IPv6 UDP - uses: cilium/little-vm-helper@908ab1ff8a596a03cd5221a1f8602dc44c3f906d # v0.0.12 + uses: cilium/little-vm-helper@9d758b756305e83718a51b792a5aeabd022a39ec # v0.0.16 + with: + provision: 'false' + cmd: | + set -ex + + docker exec dae ip net e dae dig @2001:4860:4860::8844 one.one.one.one + cat /host/dae.log | grep -F -- '-> [2001:4860:4860::8844]:53' + + - name: Setup LAN UDP port conflict + uses: cilium/little-vm-helper@9d758b756305e83718a51b792a5aeabd022a39ec # v0.0.16 + with: + provision: 'false' + cmd: | + set -ex + + docker restart -t0 dae v2ray + + docker exec dae rm -f /var/run/netns/dae + docker exec dae bash /host/lan.bash + docker exec dae sysctl net.ipv4.conf.dae-veth-peer.send_redirects=0 + docker exec dae sysctl net.ipv6.conf.dae-veth-peer.forwarding=1 + + nohup docker exec v2ray v2ray -c /host/v2ray.json &> v2ray.log & + nohup docker exec dae /host/dae/dae run -c /host/conf.dae &> dae.log & + sleep 5s + nohup docker exec dae nc -lu 53 &> nc.log & + + - name: Check LAN IPv4 UDP with port conflict + uses: cilium/little-vm-helper@9d758b756305e83718a51b792a5aeabd022a39ec # v0.0.16 + with: + provision: 'false' + cmd: | + set -ex + + docker exec dae ip net e dae dig @8.8.4.4 one.one.one.one + cat /host/dae.log | grep -F -- '-> 8.8.4.4:53' + cat /host/v2ray.access.log | grep -q 'accepted udp:8.8.4.4:53' + + - name: Check LAN IPv6 UDP with port conflict + uses: cilium/little-vm-helper@9d758b756305e83718a51b792a5aeabd022a39ec # v0.0.16 with: provision: 'false' cmd: | diff --git a/cmd/run.go b/cmd/run.go index c3e24d963..65b86fe68 100644 --- a/cmd/run.go +++ b/cmd/run.go @@ -135,9 +135,12 @@ func Run(log *logrus.Logger, conf *config.Config, externGeoDataDirs []string) (e _ = os.WriteFile(PidFilePath, []byte(strconv.Itoa(os.Getpid())), 0644) } }() - if listener, err = c.ListenAndServe(readyChan, conf.Global.TproxyPort); err != nil { - log.Errorln("ListenAndServe:", err) - } + control.GetDaeNetns().With(func() error { + if listener, err = c.ListenAndServe(readyChan, conf.Global.TproxyPort); err != nil { + log.Errorln("ListenAndServe:", err) + } + return err + }) sigs <- nil }() reloading := false diff --git a/config/config.go b/config/config.go index 2322d2798..513560a05 100644 --- a/config/config.go +++ b/config/config.go @@ -35,10 +35,11 @@ type Global struct { DialMode string `mapstructure:"dial_mode" default:"domain"` DisableWaitingNetwork bool `mapstructure:"disable_waiting_network" default:"false"` AutoConfigKernelParameter bool `mapstructure:"auto_config_kernel_parameter" default:"false"` - AutoConfigFirewallRule bool `mapstructure:"auto_config_firewall_rule" default:"false"` - SniffingTimeout time.Duration `mapstructure:"sniffing_timeout" default:"100ms"` - TlsImplementation string `mapstructure:"tls_implementation" default:"tls"` - UtlsImitate string `mapstructure:"utls_imitate" default:"chrome_auto"` + // DEPRECATED: not used as of https://github.com/daeuniverse/dae/pull/458 + AutoConfigFirewallRule bool `mapstructure:"auto_config_firewall_rule" default:"false"` + SniffingTimeout time.Duration `mapstructure:"sniffing_timeout" default:"100ms"` + TlsImplementation string `mapstructure:"tls_implementation" default:"tls"` + UtlsImitate string `mapstructure:"utls_imitate" default:"chrome_auto"` } type Utls struct { diff --git a/control/anyfrom_pool.go b/control/anyfrom_pool.go index bfab07896..5fe2b3616 100644 --- a/control/anyfrom_pool.go +++ b/control/anyfrom_pool.go @@ -192,7 +192,12 @@ func (p *AnyfromPool) GetOrCreate(lAddr string, ttl time.Duration) (conn *Anyfro }, KeepAlive: 0, } - pc, err := d.ListenPacket(context.Background(), "udp", lAddr) + var err error + var pc net.PacketConn + GetDaeNetns().With(func() error { + pc, err = d.ListenPacket(context.Background(), "udp", lAddr) + return nil + }) if err != nil { return nil, true, err } diff --git a/control/bpf_utils.go b/control/bpf_utils.go index 40d99f5a2..1ca61c020 100644 --- a/control/bpf_utils.go +++ b/control/bpf_utils.go @@ -217,13 +217,24 @@ func fullLoadBpfObjects( opts *loadBpfOptions, ) (err error) { retryLoadBpf: + netnsID, err := GetDaeNetns().NetnsID() + if err != nil { + return fmt.Errorf("failed to get netns id: %w", err) + } constants := map[string]interface{}{ "PARAM": struct { tproxyPort uint32 controlPlanePid uint32 + dae0Ifindex uint32 + dae0NetnsId uint32 + dae0peerMac [6]byte + padding [2]byte }{ tproxyPort: uint32(opts.BigEndianTproxyPort), controlPlanePid: uint32(os.Getpid()), + dae0Ifindex: uint32(GetDaeNetns().Dae0().Attrs().Index), + dae0NetnsId: uint32(netnsID), + dae0peerMac: [6]byte(GetDaeNetns().Dae0Peer().Attrs().HardwareAddr), }, } if err = loadBpfObjectsWithConstants(bpf, opts.CollectionOptions, constants); err != nil { diff --git a/control/control_plane.go b/control/control_plane.go index 8f23f2faa..8f1fdf7a3 100644 --- a/control/control_plane.go +++ b/control/control_plane.go @@ -128,6 +128,15 @@ func NewControlPlane( if err = rlimit.RemoveMemlock(); err != nil { return nil, fmt.Errorf("rlimit.RemoveMemlock:%v", err) } + + InitDaeNetns(log) + if err = InitSysctlManager(log); err != nil { + return nil, err + } + + if err = GetDaeNetns().Setup(); err != nil { + return nil, fmt.Errorf("failed to setup dae netns: %w", err) + } pinPath := filepath.Join(consts.BpfPinRoot, consts.AppName) if err = os.MkdirAll(pinPath, 0755); err != nil && !os.IsExist(err) { if os.IsNotExist(err) { @@ -194,20 +203,6 @@ func NewControlPlane( } }() - if len(global.LanInterface) > 0 || len(global.WanInterface) > 0 { - if err = core.setupRoutingPolicy(); err != nil { - return nil, err - } - if global.AutoConfigFirewallRule { - if ok := core.addAcceptInputMark(); ok { - core.deferFuncs = append(core.deferFuncs, func() error { - core.delAcceptInputMark() - return nil - }) - } - } - } - /// Bind to links. Binding should be advance of dialerGroups to avoid un-routable old connection. // Bind to LAN if len(global.LanInterface) > 0 { @@ -232,6 +227,10 @@ func NewControlPlane( } } } + // Bind to dae0 and dae0peer + if err = core.bindDaens(); err != nil { + return nil, fmt.Errorf("bindDaens: %w", err) + } /// DialerGroups (outbounds). if global.AllowInsecure { @@ -471,11 +470,6 @@ func NewControlPlane( } go dnsUpstream.InitUpstreams() - InitDaeNetns(log) - if err = InitSysctlManager(log); err != nil { - return nil, err - } - close(plane.ready) return plane, nil } diff --git a/control/control_plane_core.go b/control/control_plane_core.go index bafe75c9e..0c2176f94 100644 --- a/control/control_plane_core.go +++ b/control/control_plane_core.go @@ -9,12 +9,9 @@ import ( "context" "errors" "fmt" - "net" "net/netip" "os" - "os/exec" "regexp" - "strings" "sync" "github.com/cilium/ebpf" @@ -194,175 +191,6 @@ func (c *controlPlaneCore) delQdisc(ifname string) error { return nil } -// TODO: Support more than firewalld and fw4: need more user feedback. -var nftInputChains = [][3]string{ - {"inet", "firewalld", "filter_INPUT"}, - {"inet", "fw4", "input"}, -} - -func (c *controlPlaneCore) addAcceptInputMark() (ok bool) { - for _, rule := range nftInputChains { - if err := exec.Command("nft", "insert rule "+strings.Join(rule[:], " ")+" mark & "+consts.TproxyMarkString+" == "+consts.TproxyMarkString+" accept").Run(); err == nil { - ok = true - } - } - return ok -} - -func (c *controlPlaneCore) delAcceptInputMark() (ok bool) { - for _, rule := range nftInputChains { - output, err := exec.Command("nft", "--handle", "--numeric", "list", "chain", rule[0], rule[1], rule[2]).Output() - if err != nil { - continue - } - lines := strings.Split(string(output), "\n") - regex := regexp.MustCompile("meta mark & " + consts.TproxyMarkString + " == " + consts.TproxyMarkString + " accept # handle ([0-9]+)") - for _, line := range lines { - matches := regex.FindStringSubmatch(line) - if len(matches) >= 2 { - handle := matches[1] - if err = exec.Command("nft", "delete rule "+strings.Join(rule[:], " ")+" handle "+handle).Run(); err == nil { - ok = true - } - break - } - } - } - return ok -} - -func (c *controlPlaneCore) setupRoutingPolicy() (err error) { - /// Insert ip rule / ip route. - var table = 2023 + c.flip - - /** ip table - ip route add local default dev lo table 2023 - ip -6 route add local default dev lo table 2023 - */ - routes := []netlink.Route{{ - Scope: unix.RT_SCOPE_HOST, - LinkIndex: consts.LoopbackIfIndex, - Dst: &net.IPNet{ - IP: []byte{0, 0, 0, 0}, - Mask: net.CIDRMask(0, 32), - }, - Table: table, - Type: unix.RTN_LOCAL, - }, { - Scope: unix.RT_SCOPE_HOST, - LinkIndex: consts.LoopbackIfIndex, - Dst: &net.IPNet{ - IP: []byte{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - Mask: net.CIDRMask(0, 128), - }, - Table: table, - Type: unix.RTN_LOCAL, - }} - var routeBadIpv6 bool - cleanRoutes := func() error { - var errs error - for _, route := range routes { - if e := netlink.RouteDel(&route); e != nil { - if len(route.Dst.IP) == net.IPv6len && routeBadIpv6 { - // Not clean for bad ipv6. - continue - } - if errs != nil { - errs = fmt.Errorf("%w; %v", errs, e) - } else { - errs = e - } - } - } - if errs != nil { - return fmt.Errorf("IpRouteDel(lo): %w", errs) - } - return nil - } -tryRouteAddAgain: - for _, route := range routes { - if err = netlink.RouteAdd(&route); err != nil { - if os.IsExist(err) { - _ = cleanRoutes() - goto tryRouteAddAgain - } - if len(route.Dst.IP) == net.IPv6len { - // ipv6 - c.log.Warnln("IpRouteAdd: Bad IPv6 support. Perhaps your machine disabled IPv6.") - routeBadIpv6 = true - continue - } - return fmt.Errorf("IpRouteAdd: %w", err) - } - } - c.deferFuncs = append(c.deferFuncs, cleanRoutes) - - /** ip rule - ip rule add fwmark 0x8000000/0x8000000 table 2023 - ip -6 rule add fwmark 0x8000000/0x8000000 table 2023 - */ - rules := []netlink.Rule{{ - SuppressIfgroup: -1, - SuppressPrefixlen: -1, - Priority: -1, - Goto: -1, - Flow: -1, - Family: unix.AF_INET, - Table: table, - Mark: int(consts.TproxyMark), - Mask: int(consts.TproxyMark), - }, { - SuppressIfgroup: -1, - SuppressPrefixlen: -1, - Priority: -1, - Goto: -1, - Flow: -1, - Family: unix.AF_INET6, - Table: table, - Mark: int(consts.TproxyMark), - Mask: int(consts.TproxyMark), - }} - var ruleBadIpv6 bool - cleanRules := func() error { - var errs error - for _, rule := range rules { - if rule.Family == unix.AF_INET6 && ruleBadIpv6 { - // Not clean for bad ipv6. - continue - } - if e := netlink.RuleDel(&rule); e != nil { - if errs != nil { - errs = fmt.Errorf("%w; %v", errs, e) - } else { - errs = e - } - } - } - if errs != nil { - return fmt.Errorf("IpRuleDel: %w", errs) - } - return nil - } -tryRuleAddAgain: - for _, rule := range rules { - if err = netlink.RuleAdd(&rule); err != nil { - if os.IsExist(err) { - _ = cleanRules() - goto tryRuleAddAgain - } - if rule.Family == unix.AF_INET6 { - // ipv6 - c.log.Warnln("IpRuleAdd: Bad IPv6 support. Perhaps your machine disabled IPv6 (need CONFIG_IPV6_MULTIPLE_TABLES).") - ruleBadIpv6 = true - continue - } - return fmt.Errorf("IpRuleAdd: %w", err) - } - } - c.deferFuncs = append(c.deferFuncs, cleanRules) - return nil -} - func (c *controlPlaneCore) addLinkCb(_ifname string, rtmType uint16, cb func()) error { ch := make(chan netlink.LinkUpdate) done := make(chan struct{}) @@ -555,11 +383,6 @@ func (c *controlPlaneCore) setupSkPidMonitor() error { } func (c *controlPlaneCore) bindWan(ifname string, autoConfigKernelParameter bool) error { - if autoConfigKernelParameter { - if err := sysctl.Set(fmt.Sprintf("net.ipv4.conf.%v.accept_local", ifname), "1", false); err != nil { - return err - } - } return c._bindWan(ifname) } @@ -626,36 +449,84 @@ func (c *controlPlaneCore) _bindWan(ifname string) error { return nil }) - filterIngress := &netlink.BpfFilter{ + return nil +} + +func (c *controlPlaneCore) bindDaens() (err error) { + daens := GetDaeNetns() + + // tproxy_dae0peer_ingress@eth0 at dae netns + daens.With(func() error { + return c.addQdisc(daens.Dae0Peer().Attrs().Name) + }) + filterDae0peerIngress := &netlink.BpfFilter{ FilterAttrs: netlink.FilterAttrs{ - LinkIndex: link.Attrs().Index, + LinkIndex: daens.Dae0Peer().Attrs().Index, Parent: netlink.HANDLE_MIN_INGRESS, - Handle: netlink.MakeHandle(0x2023, 0b010+uint16(c.flip)), + Handle: netlink.MakeHandle(0x2022, 0b010+uint16(c.flip)), Protocol: unix.ETH_P_ALL, - Priority: 1, + Priority: 0, }, - Fd: c.bpf.bpfPrograms.TproxyWanIngress.FD(), - Name: consts.AppName + "_wan_ingress", + Fd: c.bpf.bpfPrograms.TproxyDae0peerIngress.FD(), + Name: consts.AppName + "_dae0peer_ingress", DirectAction: true, } - _ = netlink.FilterDel(filterIngress) + daens.With(func() error { + return netlink.FilterDel(filterDae0peerIngress) + }) // Remove and add. if !c.isReload { // Clean up thoroughly. - filterIngressFlipped := deepcopy.Copy(filterIngress).(*netlink.BpfFilter) + filterIngressFlipped := deepcopy.Copy(filterDae0peerIngress).(*netlink.BpfFilter) filterIngressFlipped.FilterAttrs.Handle ^= 1 - _ = netlink.FilterDel(filterIngressFlipped) + daens.With(func() error { + return netlink.FilterDel(filterDae0peerIngress) + }) } - if err := netlink.FilterAdd(filterIngress); err != nil { + if err = daens.With(func() error { + return netlink.FilterAdd(filterDae0peerIngress) + }); err != nil { return fmt.Errorf("cannot attach ebpf object to filter ingress: %w", err) } c.deferFuncs = append(c.deferFuncs, func() error { - if err := netlink.FilterDel(filterIngress); err != nil { - return fmt.Errorf("FilterDel(%v:%v): %w", ifname, filterIngress.Name, err) + daens.With(func() error { + return netlink.FilterDel(filterDae0peerIngress) + }) + return nil + }) + + // tproxy_dae0_ingress@dae0 at host netns + c.addQdisc(daens.Dae0().Attrs().Name) + filterDae0Ingress := &netlink.BpfFilter{ + FilterAttrs: netlink.FilterAttrs{ + LinkIndex: daens.Dae0().Attrs().Index, + Parent: netlink.HANDLE_MIN_INGRESS, + Handle: netlink.MakeHandle(0x2022, 0b010+uint16(c.flip)), + Protocol: unix.ETH_P_ALL, + Priority: 0, + }, + Fd: c.bpf.bpfPrograms.TproxyDae0Ingress.FD(), + Name: consts.AppName + "_dae0_ingress", + DirectAction: true, + } + _ = netlink.FilterDel(filterDae0Ingress) + // Remove and add. + if !c.isReload { + // Clean up thoroughly. + filterEgressFlipped := deepcopy.Copy(filterDae0Ingress).(*netlink.BpfFilter) + filterEgressFlipped.FilterAttrs.Handle ^= 1 + _ = netlink.FilterDel(filterEgressFlipped) + } + if err := netlink.FilterAdd(filterDae0Ingress); err != nil { + return fmt.Errorf("cannot attach ebpf object to filter egress: %w", err) + } + c.deferFuncs = append(c.deferFuncs, func() error { + if err := netlink.FilterDel(filterDae0Ingress); err != nil && !os.IsNotExist(err) { + return fmt.Errorf("FilterDel(%v:%v): %w", daens.Dae0().Attrs().Name, filterDae0Ingress.Name, err) } return nil }) - return nil + return } // BatchUpdateDomainRouting update bpf map domain_routing. Since one IP may have multiple domains, this function should diff --git a/control/kern/tproxy.c b/control/kern/tproxy.c index 2474cbe66..488bc2764 100644 --- a/control/kern/tproxy.c +++ b/control/kern/tproxy.c @@ -39,6 +39,9 @@ #define IPV6_DST_OFF(link_h_len) (link_h_len + offsetof(struct ipv6hdr, daddr)) #define IPV6_SRC_OFF(link_h_len) (link_h_len + offsetof(struct ipv6hdr, saddr)) +#define PACKET_HOST 0 +#define PACKET_OTHERHOST 3 + #define NOWHERE_IFINDEX 0 #define LOOPBACK_IFINDEX 1 @@ -108,9 +111,6 @@ struct { __uint(max_entries, 2); } listen_socket_map SEC(".maps"); -/// TODO: Remove items from the dst_map by conntrack. -// Dest map: - union ip6 { __u8 u6_addr8[16]; __be16 u6_addr16[8]; @@ -118,6 +118,26 @@ union ip6 { __be64 u6_addr64[2]; }; +struct redirect_tuple { + union ip6 sip; + union ip6 dip; + __u8 l4proto; +}; + +struct redirect_entry { + __u32 ifindex; + __u8 smac[6]; + __u8 dmac[6]; + __u8 from_wan; +}; + +struct { + __uint(type, BPF_MAP_TYPE_LRU_HASH); + __type(key, struct redirect_tuple); + __type(value, struct redirect_entry); + __uint(max_entries, 65536); +} redirect_track SEC(".maps"); + struct ip_port { union ip6 ip; __be16 port; @@ -149,6 +169,10 @@ struct tuples { struct dae_param { __u32 tproxy_port; __u32 control_plane_pid; + __u32 dae0_ifindex; + __u32 dae_netns_id; + __u8 dae0peer_mac[6]; + __u8 padding[2]; }; static volatile const struct dae_param PARAM = {}; @@ -812,15 +836,14 @@ static __always_inline __u32 get_link_h_len(__u32 ifindex, } static __always_inline int -assign_socket_tcp(struct __sk_buff *skb, struct bpf_sock_tuple *tuple, - __u32 len, bool established) { +lookup_and_assign_tcp_established(struct __sk_buff *skb, struct bpf_sock_tuple *tuple, __u32 len) +{ int ret = -1; struct bpf_sock *sk = bpf_skc_lookup_tcp(skb, tuple, len, BPF_F_CURRENT_NETNS, 0); if (!sk) return -1; - if (established && - (sk->state == BPF_TCP_LISTEN || sk->state == BPF_TCP_TIME_WAIT)) { + if (sk->state == BPF_TCP_LISTEN || sk->state == BPF_TCP_TIME_WAIT) { goto release; } @@ -831,9 +854,14 @@ assign_socket_tcp(struct __sk_buff *skb, struct bpf_sock_tuple *tuple, } static __always_inline int -assign_socket_udp(struct __sk_buff *skb, - struct bpf_sock_tuple *tuple, __u32 len) { - struct bpf_sock *sk = bpf_sk_lookup_udp(skb, tuple, len, BPF_F_CURRENT_NETNS, 0); +assign_listener(struct __sk_buff *skb, __u8 l4proto) +{ + struct bpf_sock *sk; + if (l4proto == IPPROTO_TCP) + sk = bpf_map_lookup_elem(&listen_socket_map, &zero_key); + else + sk = bpf_map_lookup_elem(&listen_socket_map, &one_key); + if (!sk) return -1; @@ -843,11 +871,39 @@ assign_socket_udp(struct __sk_buff *skb, } static __always_inline int -assign_socket(struct __sk_buff *skb, struct bpf_sock_tuple *tuple, - __u32 len, __u8 nexthdr, bool established) { - if (nexthdr == IPPROTO_TCP) - return assign_socket_tcp(skb, tuple, len, established); - return assign_socket_udp(skb, tuple, len); +redirect_to_control_plane(struct __sk_buff *skb, __u32 link_h_len, + struct tuples *tuples, __u8 l4proto, + struct ethhdr *ethh, __u8 from_wan) { + + /* Redirect from L3 dev to L2 dev, e.g. wg0 -> veth */ + if (!link_h_len) { + __u16 l3proto = skb->protocol; + bpf_skb_change_head(skb, sizeof(struct ethhdr), 0); + bpf_skb_store_bytes(skb, offsetof(struct ethhdr, h_proto), + &l3proto, sizeof(l3proto), 0); + } + + bpf_skb_store_bytes(skb, offsetof(struct ethhdr, h_dest), + (void *)&PARAM.dae0peer_mac, sizeof(ethh->h_dest), 0); + + struct redirect_tuple redirect_tuple = {}; + if (skb->protocol == bpf_htons(ETH_P_IP)) { + redirect_tuple.sip.u6_addr32[3] = tuples->five.sip.u6_addr32[3]; + redirect_tuple.dip.u6_addr32[3] = tuples->five.dip.u6_addr32[3]; + } else { + __builtin_memcpy(&redirect_tuple.sip, &tuples->five.sip, IPV6_BYTE_LENGTH); + __builtin_memcpy(&redirect_tuple.dip, &tuples->five.dip, IPV6_BYTE_LENGTH); + } + redirect_tuple.l4proto = l4proto; + struct redirect_entry redirect_entry = {}; + redirect_entry.ifindex = skb->ifindex; + redirect_entry.from_wan = from_wan; + __builtin_memcpy(redirect_entry.smac, ethh->h_source, sizeof(ethh->h_source)); + __builtin_memcpy(redirect_entry.dmac, ethh->h_dest, sizeof(ethh->h_dest)); + bpf_map_update_elem(&redirect_track, &redirect_tuple, &redirect_entry, BPF_ANY); + + skb->cb[0] = TPROXY_MARK; + return bpf_redirect(PARAM.dae0_ifindex, 0); } SEC("tc/ingress") @@ -893,7 +949,6 @@ int tproxy_lan_ingress(struct __sk_buff *skb) { struct bpf_sock_tuple tuple = {0}; __u32 tuple_size; struct bpf_sock *sk; - bool is_old_conn = false; __u32 flag[8]; void *l4hdr; @@ -917,11 +972,11 @@ int tproxy_lan_ingress(struct __sk_buff *skb) { goto new_connection; } - sk = bpf_skc_lookup_tcp(skb, &tuple, tuple_size, BPF_F_CURRENT_NETNS, 0); + sk = bpf_skc_lookup_tcp(skb, &tuple, tuple_size, PARAM.dae_netns_id, 0); if (sk) { if (sk->state != BPF_TCP_LISTEN) { - is_old_conn = true; - goto assign; + bpf_sk_release(sk); + goto control_plane; } bpf_sk_release(sk); } @@ -1015,45 +1070,8 @@ int tproxy_lan_ingress(struct __sk_buff *skb) { } // Assign to control plane. - - if (l4proto == IPPROTO_TCP) { - // TCP. - sk = bpf_map_lookup_elem(&listen_socket_map, &zero_key); - if (!sk || sk->state != BPF_TCP_LISTEN) { - bpf_printk("accpet tcp tproxy not listen"); - goto sk_accept; - } - } else { - // UDP. - - sk = bpf_map_lookup_elem(&listen_socket_map, &one_key); - if (!sk) { - bpf_printk("accpet udp tproxy not listen"); - goto sk_accept; - } - } - -assign: - skb->mark = TPROXY_MARK; - ret = bpf_sk_assign(skb, sk, 0); - bpf_sk_release(sk); - if (ret) { - if (is_old_conn && ret == -ESOCKTNOSUPPORT) { - bpf_printk("bpf_sk_assign: %d, perhaps you have other TPROXY programs " - "(such as v2ray) running?", - ret); - return TC_ACT_OK; - } else { - bpf_printk("bpf_sk_assign: %d", ret); - } - return TC_ACT_SHOT; - } - return TC_ACT_OK; - -sk_accept: - if (sk) { - bpf_sk_release(sk); - } +control_plane: + return redirect_to_control_plane(skb, link_h_len, &tuples, l4proto, ðh, 0); direct: return TC_ACT_OK; @@ -1112,9 +1130,6 @@ static __always_inline bool pid_is_control_plane(struct __sk_buff *skb, } } -__u8 special_mac_to_tproxy[6] = {2, 0, 2, 3, 0, 0}; -__u8 special_mac_from_tproxy[6] = {2, 0, 2, 3, 0, 1}; - // Routing and redirect the packet back. // We cannot modify the dest address here. So we cooperate with wan_ingress. SEC("tc/wan_egress") @@ -1273,18 +1288,6 @@ int tproxy_wan_egress(struct __sk_buff *skb) { &routing_result, BPF_ANY); } - // Write mac. - if ((ret = - bpf_skb_store_bytes(skb, offsetof(struct ethhdr, h_dest), - ethh.h_source, sizeof(ethh.h_source), 0))) { - return TC_ACT_SHOT; - } - if ((ret = bpf_skb_store_bytes(skb, offsetof(struct ethhdr, h_source), - special_mac_to_tproxy, - sizeof(ethh.h_source), 0))) { - return TC_ACT_SHOT; - }; - } else if (l4proto == IPPROTO_UDP) { // Routing. It decides if we redirect traffic to control plane. @@ -1364,39 +1367,13 @@ int tproxy_wan_egress(struct __sk_buff *skb) { return TC_ACT_SHOT; } - // Write mac. - if ((ret = - bpf_skb_store_bytes(skb, offsetof(struct ethhdr, h_dest), - ethh.h_source, sizeof(ethh.h_source), 0))) { - return TC_ACT_SHOT; - } - if ((ret = bpf_skb_store_bytes(skb, offsetof(struct ethhdr, h_source), - special_mac_to_tproxy, - sizeof(ethh.h_source), 0))) { - return TC_ACT_SHOT; - }; } - // // Print packet in hex for debugging (checksum or something else). - // if ((l4proto == IPPROTO_TCP ? tcph.dest : udph.dest) == bpf_htons(8443)) { - // bpf_printk("PRINT OUTPUT PACKET"); - // for (__u32 i = 0; i < skb->len && i < 500; i++) { - // __u8 t = 0; - // bpf_skb_load_bytes(skb, i, &t, 1); - // bpf_printk("%02x", t); - // } - // } - - // Redirect from egress to ingress. - if ((ret = bpf_redirect(skb->ifindex, BPF_F_INGRESS)) == TC_ACT_SHOT) { - bpf_printk("Shot bpf_redirect: %d", ret); - return TC_ACT_SHOT; - } - return TC_ACT_REDIRECT; + return redirect_to_control_plane(skb, link_h_len, &tuples, l4proto, ðh, 1); } -SEC("tc/wan_ingress") -int tproxy_wan_ingress(struct __sk_buff *skb) { +SEC("tc/dae0peer_ingress") +int tproxy_dae0peer_ingress(struct __sk_buff *skb) { struct ethhdr ethh; struct iphdr iph; struct ipv6hdr ipv6h; @@ -1405,10 +1382,12 @@ int tproxy_wan_ingress(struct __sk_buff *skb) { struct udphdr udph; __u8 ihl; __u8 l4proto; - __u32 link_h_len; - if (get_link_h_len(skb->ifindex, &link_h_len)) { - return TC_ACT_OK; + __u32 link_h_len = 14; + + if (skb->cb[0] != TPROXY_MARK) { + return TC_ACT_SHOT; } + int ret = parse_transport(skb, link_h_len, ðh, &iph, &ipv6h, &icmp6h, &tcph, &udph, &ihl, &l4proto); if (ret) { @@ -1420,93 +1399,84 @@ int tproxy_wan_ingress(struct __sk_buff *skb) { struct tuples tuples; get_tuples(skb, &tuples, &iph, &ipv6h, &tcph, &udph, l4proto); - // bpf_printk("bpf_ntohs(*(__u16 *)ðh.h_source[4]): %u", - // bpf_ntohs(*(__u16 *)ðh.h_source[4])); - // Tproxy related. - __u16 tproxy_typ = bpf_ntohs(*(__u16 *)ðh.h_source[4]); - if (*(__u32 *)ðh.h_source[0] != bpf_htonl(0x02000203) || tproxy_typ > 1) { - // Check for security. Reject packets that is UDP and sent to tproxy port. - __be16 tproxy_port = PARAM.tproxy_port; - if (!tproxy_port) { - goto accept; - } - if (unlikely(tproxy_port == tuples.five.dport)) { - struct bpf_sock_tuple tuple = {0}; - __u32 tuple_size; - - if (skb->protocol == bpf_htons(ETH_P_IP)) { - tuple.ipv4.daddr = tuples.five.dip.u6_addr32[3]; - tuple.ipv4.dport = tuples.five.dport; - tuple_size = sizeof(tuple.ipv4); - } else { - __builtin_memcpy(tuple.ipv6.daddr, &tuples.five.dip, IPV6_BYTE_LENGTH); - tuple.ipv6.dport = tuples.five.dport; - tuple_size = sizeof(tuple.ipv6); - } - - struct bpf_sock *sk = - bpf_sk_lookup_udp(skb, &tuple, tuple_size, BPF_F_CURRENT_NETNS, 0); - if (sk) { - // Scope is host. - bpf_sk_release(sk); - return TC_ACT_SHOT; - } - } - accept: - return TC_ACT_PIPE; - } - - // Should send the packet to tproxy. skb->mark = TPROXY_MARK; - struct bpf_sock_tuple tuple = {}; - __u32 tuple_size = skb->protocol == bpf_htons(ETH_P_IP) ? - sizeof(tuple.ipv4) : sizeof(tuple.ipv6); + bpf_skb_change_type(skb, PACKET_HOST); /* First look for established socket. * This is done for TCP only, otherwise bpf_sk_lookup_udp would find * previously created transparent socket for UDP, which is not what we want. * */ if (l4proto == IPPROTO_TCP) { + __u32 tuple_size; + struct bpf_sock_tuple tuple = {}; + if (skb->protocol == bpf_htons(ETH_P_IP)) { tuple.ipv4.saddr = tuples.five.sip.u6_addr32[3]; - tuple.ipv4.sport = tuples.five.sport; tuple.ipv4.daddr = tuples.five.dip.u6_addr32[3]; + tuple.ipv4.sport = tuples.five.sport; tuple.ipv4.dport = tuples.five.dport; + tuple_size = sizeof(tuple.ipv4); } else { __builtin_memcpy(tuple.ipv6.saddr, &tuples.five.sip, IPV6_BYTE_LENGTH); __builtin_memcpy(tuple.ipv6.daddr, &tuples.five.dip, IPV6_BYTE_LENGTH); tuple.ipv6.sport = tuples.five.sport; tuple.ipv6.dport = tuples.five.dport; + tuple_size = sizeof(tuple.ipv6); } - ret = assign_socket(skb, &tuple, tuple_size, l4proto, true); - if (ret == 0) { + if (lookup_and_assign_tcp_established(skb, &tuple, tuple_size) == 0) { return TC_ACT_OK; } } /* Then look for tproxy listening socket */ - __be16 tproxy_port = PARAM.tproxy_port; - if (!tproxy_port) { + if (assign_listener(skb, l4proto) == 0) { + return TC_ACT_OK; + } + + return TC_ACT_SHOT; +} + +SEC("tc/dae0_ingress") +int tproxy_dae0_ingress(struct __sk_buff *skb) { + struct ethhdr ethh; + struct iphdr iph; + struct ipv6hdr ipv6h; + struct icmp6hdr icmp6h; + struct tcphdr tcph; + struct udphdr udph; + __u8 ihl; + __u8 l4proto; + __u32 link_h_len = 14; + if (parse_transport(skb, link_h_len, ðh, &iph, &ipv6h, &icmp6h, + &tcph, &udph, &ihl, &l4proto)) { return TC_ACT_OK; } + struct tuples tuples; + get_tuples(skb, &tuples, &iph, &ipv6h, &tcph, &udph, l4proto); + + // reverse the tuple! + struct redirect_tuple redirect_tuple = {}; if (skb->protocol == bpf_htons(ETH_P_IP)) { - tuple.ipv4.saddr = 0; - tuple.ipv4.daddr = tuples.five.sip.u6_addr32[3]; - tuple.ipv4.sport = 0; - tuple.ipv4.dport = tproxy_port; + redirect_tuple.sip.u6_addr32[3] = tuples.five.dip.u6_addr32[3]; + redirect_tuple.dip.u6_addr32[3] = tuples.five.sip.u6_addr32[3]; } else { - __builtin_memset(tuple.ipv6.saddr, 0, IPV6_BYTE_LENGTH); - __builtin_memcpy(tuple.ipv6.daddr, &tuples.five.sip, IPV6_BYTE_LENGTH); - tuple.ipv6.sport = 0; - tuple.ipv6.dport = tproxy_port; + __builtin_memcpy(&redirect_tuple.sip, &tuples.five.dip, IPV6_BYTE_LENGTH); + __builtin_memcpy(&redirect_tuple.dip, &tuples.five.sip, IPV6_BYTE_LENGTH); } - ret = assign_socket(skb, &tuple, tuple_size, l4proto, false); - if (ret == 0) { + redirect_tuple.l4proto = l4proto; + struct redirect_entry *redirect_entry = bpf_map_lookup_elem(&redirect_track, &redirect_tuple); + if (!redirect_entry) return TC_ACT_OK; - } - return TC_ACT_SHOT; + bpf_skb_store_bytes(skb, offsetof(struct ethhdr, h_source), + redirect_entry->dmac, sizeof(redirect_entry->dmac), 0); + bpf_skb_store_bytes(skb, offsetof(struct ethhdr, h_dest), + redirect_entry->smac, sizeof(redirect_entry->smac), 0); + __u32 type = redirect_entry->from_wan ? PACKET_HOST : PACKET_OTHERHOST; + bpf_skb_change_type(skb, type); + __u64 flags = redirect_entry->from_wan ? BPF_F_INGRESS : 0; + return bpf_redirect(redirect_entry->ifindex, flags); } static int __always_inline _update_map_elem_by_cookie(const __u64 cookie) { diff --git a/control/netns_utils.go b/control/netns_utils.go index 3954f94fe..a32c983af 100644 --- a/control/netns_utils.go +++ b/control/netns_utils.go @@ -1,7 +1,6 @@ package control import ( - "bytes" "fmt" "net" "os" @@ -10,6 +9,7 @@ import ( "sync" "sync/atomic" + "github.com/daeuniverse/dae/common/consts" "github.com/sirupsen/logrus" "github.com/vishvananda/netlink" "github.com/vishvananda/netns" @@ -48,6 +48,18 @@ func GetDaeNetns() *DaeNetns { return daeNetns } +func (ns *DaeNetns) NetnsID() (int, error) { + return netlink.GetNetNsIdByFd(int(ns.daeNs)) +} + +func (ns *DaeNetns) Dae0() netlink.Link { + return ns.dae0 +} + +func (ns *DaeNetns) Dae0Peer() netlink.Link { + return ns.dae0peer +} + func (ns *DaeNetns) Setup() (err error) { if ns.setupDone.Load() { return @@ -104,10 +116,10 @@ func (ns *DaeNetns) setup() (err error) { if err = ns.setupVeth(); err != nil { return } - if err = ns.setupSysctl(); err != nil { + if err = ns.setupNetns(); err != nil { return } - if err = ns.setupNetns(); err != nil { + if err = ns.setupSysctl(); err != nil { return } if err = ns.setupIPv4Datapath(); err != nil { @@ -116,16 +128,100 @@ func (ns *DaeNetns) setup() (err error) { if err = ns.setupIPv6Datapath(); err != nil { return } - go ns.monitorDae0LinkAddr() + if err = ns.setupRoutingPolicy(); err != nil { + return + } return } +func (ns *DaeNetns) setupRoutingPolicy() (err error) { + if err = netns.Set(ns.daeNs); err != nil { + return fmt.Errorf("failed to switch to daens: %v", err) + } + defer netns.Set(ns.hostNs) + + /// Insert ip rule / ip route. + var table = 2023 + + /** ip table + ip route add local default dev lo table 2023 + ip -6 route add local default dev lo table 2023 + */ + routes := []netlink.Route{{ + Scope: unix.RT_SCOPE_HOST, + LinkIndex: consts.LoopbackIfIndex, + Dst: &net.IPNet{ + IP: []byte{0, 0, 0, 0}, + Mask: net.CIDRMask(0, 32), + }, + Table: table, + Type: unix.RTN_LOCAL, + }, { + Scope: unix.RT_SCOPE_HOST, + LinkIndex: consts.LoopbackIfIndex, + Dst: &net.IPNet{ + IP: []byte{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + Mask: net.CIDRMask(0, 128), + }, + Table: table, + Type: unix.RTN_LOCAL, + }} + for _, route := range routes { + if err = netlink.RouteAdd(&route); err != nil { + if len(route.Dst.IP) == net.IPv6len { + // ipv6 + ns.log.Warnln("IpRouteAdd: Bad IPv6 support. Perhaps your machine disabled IPv6.") + continue + } + return fmt.Errorf("IpRouteAdd: %w", err) + } + } + + /** ip rule + ip rule add fwmark 0x8000000/0x8000000 table 2023 + ip -6 rule add fwmark 0x8000000/0x8000000 table 2023 + */ + rules := []netlink.Rule{{ + SuppressIfgroup: -1, + SuppressPrefixlen: -1, + Priority: -1, + Goto: -1, + Flow: -1, + Family: unix.AF_INET, + Table: table, + Mark: int(consts.TproxyMark), + Mask: int(consts.TproxyMark), + }, { + SuppressIfgroup: -1, + SuppressPrefixlen: -1, + Priority: -1, + Goto: -1, + Flow: -1, + Family: unix.AF_INET6, + Table: table, + Mark: int(consts.TproxyMark), + Mask: int(consts.TproxyMark), + }} + + for _, rule := range rules { + if err = netlink.RuleAdd(&rule); err != nil { + if rule.Family == unix.AF_INET6 { + // ipv6 + ns.log.Warnln("IpRuleAdd: Bad IPv6 support. Perhaps your machine disabled IPv6 (need CONFIG_IPV6_MULTIPLE_TABLES).") + continue + } + return fmt.Errorf("IpRuleAdd: %w", err) + } + } + return nil +} func (ns *DaeNetns) setupVeth() (err error) { // ip l a dae0 type veth peer name dae0peer DeleteLink(HostVethName) if err = netlink.LinkAdd(&netlink.Veth{ LinkAttrs: netlink.LinkAttrs{ - Name: HostVethName, + Name: HostVethName, + TxQLen: 1000, }, PeerName: NsVethName, }); err != nil { @@ -144,38 +240,6 @@ func (ns *DaeNetns) setupVeth() (err error) { return } -func (ns *DaeNetns) setupSysctl() (err error) { - // sysctl net.ipv4.conf.dae0.rp_filter=0 - if err = sysctl.Set(fmt.Sprintf("net.ipv4.conf.%s.rp_filter", HostVethName), "0", true); err != nil { - return fmt.Errorf("failed to set rp_filter for dae0: %v", err) - } - // sysctl net.ipv4.conf.all.rp_filter=0 - if err = sysctl.Set("net.ipv4.conf.all.rp_filter", "0", true); err != nil { - return fmt.Errorf("failed to set rp_filter for all: %v", err) - } - // sysctl net.ipv4.conf.dae0.arp_filter=0 - if err = sysctl.Set(fmt.Sprintf("net.ipv4.conf.%s.arp_filter", HostVethName), "0", true); err != nil { - return fmt.Errorf("failed to set arp_filter for dae0: %v", err) - } - // sysctl net.ipv4.conf.all.arp_filter=0 - if err = sysctl.Set("net.ipv4.conf.all.arp_filter", "0", true); err != nil { - return fmt.Errorf("failed to set arp_filter for all: %v", err) - } - // sysctl net.ipv4.conf.dae0.accept_local=1 - if err = sysctl.Set(fmt.Sprintf("net.ipv4.conf.%s.accept_local", HostVethName), "1", true); err != nil { - return fmt.Errorf("failed to set accept_local for dae0: %v", err) - } - // sysctl net.ipv6.conf.dae0.disable_ipv6=0 - if err = sysctl.Set(fmt.Sprintf("net.ipv6.conf.%s.disable_ipv6", HostVethName), "0", true); err != nil { - return fmt.Errorf("failed to set disable_ipv6 for dae0: %v", err) - } - // sysctl net.ipv6.conf.dae0.forwarding=1 - SetForwarding(HostVethName, "1") - // sysctl net.ipv6.conf.all.forwarding=1 - SetForwarding("all", "1") - return -} - func (ns *DaeNetns) setupNetns() (err error) { // ip netns a daens DeleteNamedNetns(NsName) @@ -191,19 +255,50 @@ func (ns *DaeNetns) setupNetns() (err error) { if err = netlink.LinkSetNsFd(ns.dae0peer, int(ns.daeNs)); err != nil { return fmt.Errorf("failed to move dae0peer to daens: %v", err) } - return -} -func (ns *DaeNetns) setupIPv4Datapath() (err error) { if err = netns.Set(ns.daeNs); err != nil { return fmt.Errorf("failed to switch to daens: %v", err) } defer netns.Set(ns.hostNs) - // (ip net e daens) ip l s dae0peer up if err = netlink.LinkSetUp(ns.dae0peer); err != nil { return fmt.Errorf("failed to set link dae0peer up: %v", err) } + // re-fetch dae0peer to make sure we have the latest mac address + if ns.dae0peer, err = netlink.LinkByName(NsVethName); err != nil { + return fmt.Errorf("failed to get link dae0peer: %v", err) + } + lo, err := netlink.LinkByName("lo") + if err != nil { + return fmt.Errorf("failed to get link lo: %v", err) + } + // (ip net e daens) ip l s lo up + if err = netlink.LinkSetUp(lo); err != nil { + return fmt.Errorf("failed to set link lo up: %v", err) + } + return +} + +func (ns *DaeNetns) setupSysctl() (err error) { + // sysctl net.ipv6.conf.dae0.disable_ipv6=0 + if err = sysctl.Set(fmt.Sprintf("net.ipv6.conf.%s.disable_ipv6", HostVethName), "0", true); err != nil { + return fmt.Errorf("failed to set disable_ipv6 for dae0: %v", err) + } + // sysctl net.ipv6.conf.dae0.forwarding=1 + if err = sysctl.Set(fmt.Sprintf("net.ipv6.conf.%s.forwarding", HostVethName), "1", true); err != nil { + return fmt.Errorf("failed to set forwarding for dae0: %v", err) + } + // sysctl net.ipv6.conf.all.forwarding=1 + SetForwarding("all", "1") + return +} + +func (ns *DaeNetns) setupIPv4Datapath() (err error) { + if err = netns.Set(ns.daeNs); err != nil { + return fmt.Errorf("failed to switch to daens: %v", err) + } + defer netns.Set(ns.hostNs) + // (ip net e daens) ip a a 169.254.0.11 dev dae0peer // Although transparent UDP socket doesn't use this IP, it's still needed to make proper L3 header ip, ipNet, err := net.ParseCIDR("169.254.0.11/32") @@ -233,6 +328,14 @@ func (ns *DaeNetns) setupIPv4Datapath() (err error) { return fmt.Errorf("failed to add v4 route2 to dae0peer: %v", err) } // (ip net e daens) ip n r 169.254.0.1 dev dae0peer lladdr $mac_dae0 nud permanent + if err = netlink.NeighSet(&netlink.Neigh{ + IP: net.ParseIP("169.254.0.1"), + HardwareAddr: ns.dae0.Attrs().HardwareAddr, + LinkIndex: ns.dae0peer.Attrs().Index, + State: netlink.NUD_PERMANENT, + }); err != nil { + return fmt.Errorf("failed to add neigh to dae0peer: %v", err) + } return } @@ -261,21 +364,9 @@ func (ns *DaeNetns) setupIPv6Datapath() (err error) { }); err != nil { return fmt.Errorf("failed to add v6 route to dae0peer: %v", err) } - return -} - -// updateNeigh() isn't named as setupNeigh() because it requires runtime.LockOSThread() -func (ns *DaeNetns) updateNeigh() (err error) { - runtime.LockOSThread() - defer runtime.UnlockOSThread() - - if err = netns.Set(ns.daeNs); err != nil { - return fmt.Errorf("failed to switch to daens: %v", err) - } - defer netns.Set(ns.hostNs) - + // (ip net e daens) ip n r fe80::ecee:eeff:feee:eeee dev dae0peer lladdr $mac_dae0 nud permanent if err = netlink.NeighSet(&netlink.Neigh{ - IP: net.ParseIP("169.254.0.1"), + IP: net.ParseIP("fe80::ecee:eeff:feee:eeee"), HardwareAddr: ns.dae0.Attrs().HardwareAddr, LinkIndex: ns.dae0peer.Attrs().Index, State: netlink.NUD_PERMANENT, @@ -285,30 +376,6 @@ func (ns *DaeNetns) updateNeigh() (err error) { return } -func (ns *DaeNetns) monitorDae0LinkAddr() { - ch := make(chan netlink.LinkUpdate) - done := make(chan struct{}) - defer close(done) - - err := netlink.LinkSubscribe(ch, done) - if err != nil { - ns.log.Errorf("failed to subscribe link updates: %v", err) - } - if ns.dae0, err = netlink.LinkByName(HostVethName); err != nil { - ns.log.Errorf("failed to get link dae0: %v", err) - } - if err = ns.updateNeigh(); err != nil { - ns.log.Errorf("failed to update neigh: %v", err) - } - for msg := range ch { - if msg.Link.Attrs().Name == HostVethName && !bytes.Equal(msg.Link.Attrs().HardwareAddr, ns.dae0.Attrs().HardwareAddr) { - ns.log.WithField("old addr", ns.dae0.Attrs().HardwareAddr).WithField("new addr", msg.Link.Attrs().HardwareAddr).Info("dae0 link addr changed") - ns.dae0 = msg.Link - ns.updateNeigh() - } - } -} - func DeleteNamedNetns(name string) error { namedPath := path.Join("/run/netns", name) unix.Unmount(namedPath, unix.MNT_DETACH|unix.MNT_FORCE) diff --git a/control/udp.go b/control/udp.go index d996fce77..86456f366 100644 --- a/control/udp.go +++ b/control/udp.go @@ -6,11 +6,9 @@ package control import ( - "errors" "fmt" "net" "net/netip" - "syscall" "time" "github.com/daeuniverse/dae/common" @@ -50,23 +48,7 @@ func ChooseNatTimeout(data []byte, sniffDns bool) (dmsg *dnsmessage.Msg, timeout // sendPkt uses bind first, and fallback to send hdr if addr is in use. func sendPkt(log *logrus.Logger, data []byte, from netip.AddrPort, realTo, to netip.AddrPort, lConn *net.UDPConn) (err error) { - - transparentTimeout := AnyfromTimeout - if from.Port() == 53 { - // Add port 53 (udp) to whitelist to avoid conflicts with the potential local dns server. - transparentTimeout = 0 - } - uConn, _, err := DefaultAnyfromPool.GetOrCreate(from.String(), transparentTimeout) - if err != nil && errors.Is(err, syscall.EADDRINUSE) { - log.WithField("from", from). - WithField("to", to). - WithField("realTo", realTo). - Trace("Port in use, fallback to use netns.") - err = GetDaeNetns().With(func() (err error) { - uConn, _, err = DefaultAnyfromPool.GetOrCreate(from.String(), AnyfromTimeout) - return err - }) - } + uConn, _, err := DefaultAnyfromPool.GetOrCreate(from.String(), AnyfromTimeout) if err != nil { return } diff --git a/docs/en/README.md b/docs/en/README.md index d6c63e104..44cde1b36 100644 --- a/docs/en/README.md +++ b/docs/en/README.md @@ -162,7 +162,6 @@ global { log_level: info allow_insecure: false auto_config_kernel_parameter: true - auto_config_firewall_rule: true } subscription { diff --git a/docs/en/how-it-works.md b/docs/en/how-it-works.md index 6a62ceb06..90c93f6a9 100644 --- a/docs/en/how-it-works.md +++ b/docs/en/how-it-works.md @@ -37,6 +37,8 @@ The proxy mechanism of dae is akin to other programs. However, when binding to t In terms of benchmarking, dae's proxy performance slightly surpasses that of other proxy programs, but the difference is not significant. +As of [PR:implement stack bypass](https://github.com/daeuniverse/dae/pull/458), the hijack datapath has been changed to bypass stack for better performance and less stack influence (e.g. netfilter, systemd-sysctl). Please refer to the PR description for better understanding. + ### Direct Connection Mechanism Conventionally, traffic splitting involves passing traffic through a proxy program, navigating the splitting module, and then determining whether to use a proxy or establish a direct connection. This process requires parsing, processing, and copying traffic through the network stack, delivering it to the proxy program, and subsequently copying, processing, and encapsulating it through the network stack before sending it out. This consumes substantial resources. Particularly in scenarios like BitTorrent downloads, even if a direct connection is set, it still consumes numerous connections, ports, memory, and CPU resources. It might even impact NAT type in gaming situations due to the proxy program's inadequate handling, resulting in connection errors. diff --git a/docs/zh/README.md b/docs/zh/README.md index ebd475b12..8c7eca4df 100644 --- a/docs/zh/README.md +++ b/docs/zh/README.md @@ -156,7 +156,6 @@ global { log_level: info allow_insecure: false auto_config_kernel_parameter: true - auto_config_firewall_rule: true } subscription { diff --git a/example.dae b/example.dae index 791e1cf31..6512b595f 100644 --- a/example.dae +++ b/example.dae @@ -34,11 +34,6 @@ global { # https://github.com/daeuniverse/dae/blob/main/docs/en/user-guide/kernel-parameters.md to see what will dae do. auto_config_kernel_parameter: true - # Automatically configure firewall rules like firewalld and fw4. - # firewalld: nft 'insert rule inet firewalld filter_INPUT mark 0x08000000 accept' - # fw4: nft 'insert rule inet fw4 input mark 0x08000000 accept' - auto_config_firewall_rule: true - ##### Node connectivity check. # Host of URL should have both IPv4 and IPv6 if you have double stack in local. diff --git a/go.mod b/go.mod index 5ed5365ee..0596051dd 100644 --- a/go.mod +++ b/go.mod @@ -12,6 +12,7 @@ require ( github.com/daeuniverse/dae-config-dist/go/dae_config v0.0.0-20230604120805-1c27619b592d github.com/daeuniverse/outbound v0.0.0-20240101085641-7932e7df927d github.com/daeuniverse/softwind v0.0.0-20231230065827-eed67f20d2c1 + github.com/fsnotify/fsnotify v1.7.0 github.com/json-iterator/go v1.1.12 github.com/miekg/dns v1.1.55 github.com/mohae/deepcopy v0.0.0-20170929034955-c48cc78d4826 @@ -21,6 +22,7 @@ require ( github.com/spf13/cobra v1.7.0 github.com/v2rayA/ahocorasick-domain v0.0.0-20231231085011-99ceb8ef3208 github.com/vishvananda/netlink v1.1.0 + github.com/vishvananda/netns v0.0.4 github.com/x-cray/logrus-prefixed-formatter v0.5.2 golang.org/x/crypto v0.12.0 golang.org/x/exp v0.0.0-20230728194245-b0cb94b80691 @@ -54,7 +56,6 @@ require ( github.com/dgryski/go-metro v0.0.0-20211217172704-adc40b04c140 // indirect github.com/dgryski/go-rc2 v0.0.0-20150621095337-8a9021637152 // indirect github.com/eknkc/basex v1.0.1 // indirect - github.com/fsnotify/fsnotify v1.7.0 // indirect github.com/golang/protobuf v1.5.3 // indirect github.com/google/uuid v1.3.0 // indirect github.com/inconshreveable/mousetrap v1.1.0 // indirect @@ -68,7 +69,6 @@ require ( github.com/refraction-networking/utls v1.4.3 // indirect github.com/seiflotfy/cuckoofilter v0.0.0-20220411075957-e3b120b3f5fb // indirect github.com/spf13/pflag v1.0.5 // indirect - github.com/vishvananda/netns v0.0.4 // indirect gitlab.com/yawning/chacha20.git v0.0.0-20230427033715-7877545b1b37 // indirect golang.org/x/term v0.11.0 // indirect golang.org/x/text v0.12.0 // indirect diff --git a/go.sum b/go.sum index be1a57a16..cbb802385 100644 --- a/go.sum +++ b/go.sum @@ -8,8 +8,6 @@ github.com/bits-and-blooms/bitset v1.8.0 h1:FD+XqgOZDUxxZ8hzoBFuV9+cGWY9CslN6d5M github.com/bits-and-blooms/bitset v1.8.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8= github.com/bits-and-blooms/bloom/v3 v3.5.0 h1:AKDvi1V3xJCmSR6QhcBfHbCN4Vf8FfxeWkMNQfmAGhY= github.com/bits-and-blooms/bloom/v3 v3.5.0/go.mod h1:Y8vrn7nk1tPIlmLtW2ZPV+W7StdVMor6bC1xgpjMZFs= -github.com/cilium/ebpf v0.11.0 h1:V8gS/bTCCjX9uUnkUFUpPsksM8n1lXBAvHcpiFk1X2Y= -github.com/cilium/ebpf v0.11.0/go.mod h1:WE7CZAnqOL2RouJ4f1uyNhqr2P4CCvXFIqdRDUgWsVs= github.com/cilium/ebpf v0.12.3 h1:8ht6F9MquybnY97at+VDZb3eQQr8ev79RueWeVaEcG4= github.com/cilium/ebpf v0.12.3/go.mod h1:TctK1ivibvI3znr66ljgi4hqOT8EYQjz1KWBfb1UVgM= github.com/cpuguy83/go-md2man/v2 v2.0.2/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o= @@ -38,7 +36,6 @@ github.com/eknkc/basex v1.0.1/go.mod h1:k/F/exNEHFdbs3ZHuasoP2E7zeWwZblG84Y7Z59v github.com/frankban/quicktest v1.14.5 h1:dfYrrRyLtiqT9GyKXgdh+k4inNeTvmGbuSgZ3lx3GhA= github.com/frankban/quicktest v1.14.5/go.mod h1:4ptaffx2x8+WTWXmUCuVU6aPUX1/Mz7zb5vbUoiM6w0= github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMoQvtojpjFo= -github.com/fsnotify/fsnotify v1.4.9 h1:hsms1Qyu0jgnwNXIxa+/V/PDsU6CfLf6CNO8H7IWoS4= github.com/fsnotify/fsnotify v1.4.9/go.mod h1:znqG4EE+3YCdAaPaxE2ZRY/06pZUdp0tY4IgpuI1SZQ= github.com/fsnotify/fsnotify v1.7.0 h1:8JEhPFa5W2WU7YfeZzPNqzMP6Lwt7L2715Ggo0nosvA= github.com/fsnotify/fsnotify v1.7.0/go.mod h1:40Bi/Hjc2AVfZrqy+aj+yEI+/bRxZnMJyTJwOpGvigM= @@ -200,8 +197,6 @@ golang.org/x/sys v0.0.0-20211025201205-69cdffdb9359/go.mod h1:oPkhp1MJrh7nUepCBc golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.11.0 h1:eG7RXZHdqOJ1i+0lgLgCpSXAp6M3LYlAo6osgSi0xOM= -golang.org/x/sys v0.11.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.14.1-0.20231108175955-e4099bfacb8c h1:3kC/TjQ+xzIblQv39bCOyRk8fbEeJcDHwbyxPUU2BpA= golang.org/x/sys v0.14.1-0.20231108175955-e4099bfacb8c/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/term v0.11.0 h1:F9tnn/DA/Im8nCwm+fX+1/eBwi4qFjRT++MhtVC4ZX0=