Skip to content

Commit

Permalink
Merge pull request #785 from tomdee/vxlan
Browse files Browse the repository at this point in the history
backend/vxlan: simplify vxlan processing
  • Loading branch information
tomdee authored Aug 14, 2017
2 parents b119b77 + 5d3d664 commit 4973e02
Show file tree
Hide file tree
Showing 5 changed files with 136 additions and 327 deletions.
151 changes: 37 additions & 114 deletions backend/vxlan/device.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,10 @@ package vxlan
import (
"fmt"
"net"
"os"
"syscall"
"time"

log "github.com/golang/glog"
"github.com/vishvananda/netlink"
"github.com/vishvananda/netlink/nl"

"github.com/coreos/flannel/pkg/ip"
)
Expand All @@ -41,17 +38,6 @@ type vxlanDevice struct {
link *netlink.Vxlan
}

// sysctlSet writes value to the file at path, creating the file if it does
// not exist and truncating it if it does.
//
// It returns the first error encountered while creating, writing or closing
// the file, or nil if all three steps succeed.
func sysctlSet(path, value string) (err error) {
	f, err := os.Create(path)
	if err != nil {
		return err
	}
	// A failed Close can be the only sign that the data was not accepted, so
	// report it unless an earlier write error is already being returned.
	defer func() {
		if cerr := f.Close(); cerr != nil && err == nil {
			err = cerr
		}
	}()

	_, err = f.WriteString(value)
	return err
}

func newVXLANDevice(devAttrs *vxlanDeviceAttrs) (*vxlanDevice, error) {
link := &netlink.Vxlan{
LinkAttrs: netlink.LinkAttrs{
Expand All @@ -69,12 +55,6 @@ func newVXLANDevice(devAttrs *vxlanDeviceAttrs) (*vxlanDevice, error) {
if err != nil {
return nil, err
}
// this enables ARP requests being sent to userspace via netlink
sysctlPath := fmt.Sprintf("/proc/sys/net/ipv4/neigh/%s/app_solicit", devAttrs.name)
if err := sysctlSet(sysctlPath, "3"); err != nil {
return nil, err
}

return &vxlanDevice{
link: link,
}, nil
Expand All @@ -84,13 +64,15 @@ func ensureLink(vxlan *netlink.Vxlan) (*netlink.Vxlan, error) {
err := netlink.LinkAdd(vxlan)
if err == syscall.EEXIST {
// it's ok if the device already exists as long as config is similar
log.V(1).Infof("VXLAN device already exists")
existing, err := netlink.LinkByName(vxlan.Name)
if err != nil {
return nil, err
}

incompat := vxlanLinksIncompat(vxlan, existing)
if incompat == "" {
log.V(1).Infof("Returning existing device")
return existing.(*netlink.Vxlan), nil
}

Expand Down Expand Up @@ -123,28 +105,37 @@ func ensureLink(vxlan *netlink.Vxlan) (*netlink.Vxlan, error) {
}

func (dev *vxlanDevice) Configure(ipn ip.IP4Net) error {
setAddr4(dev.link, ipn.ToIPNet())
addr := netlink.Addr{IPNet: ipn.ToIPNet()}
existingAddrs, err := netlink.AddrList(dev.link, netlink.FAMILY_V4)
if err != nil {
return err
}

if err := netlink.LinkSetUp(dev.link); err != nil {
return fmt.Errorf("failed to set interface %s to UP state: %s", dev.link.Attrs().Name, err)
// flannel will never make this happen. This situation can only be caused by a user, so get them to sort it out.
if len(existingAddrs) > 1 {
return fmt.Errorf("link has incompatible addresses. Remove additional addresses and try again. %s", dev.link)
}

// explicitly add a route since there might be a route for a subnet already
// installed by Docker and then it won't get auto added
route := netlink.Route{
LinkIndex: dev.link.Attrs().Index,
Scope: netlink.SCOPE_UNIVERSE,
Dst: ipn.Network().ToIPNet(),
// If the device has an incompatible address then delete it. This can happen if the lease changes for example.
if len(existingAddrs) == 1 && !existingAddrs[0].Equal(addr) {
if err := netlink.AddrDel(dev.link, &existingAddrs[0]); err != nil {
return fmt.Errorf("failed to remove IP address %s from %s: %s", ipn.String(), dev.link.Attrs().Name, err)
}
existingAddrs = []netlink.Addr{}
}
if err := netlink.RouteAdd(&route); err != nil && err != syscall.EEXIST {
return fmt.Errorf("failed to add route (%s -> %s): %v", ipn.Network().String(), dev.link.Attrs().Name, err)

// Actually add the desired address to the interface if needed.
if len(existingAddrs) == 0 {
if err := netlink.AddrAdd(dev.link, &addr); err != nil {
return fmt.Errorf("failed to add IP address %s to %s: %s", ipn.String(), dev.link.Attrs().Name, err)
}
}

return nil
}
if err := netlink.LinkSetUp(dev.link); err != nil {
return fmt.Errorf("failed to set interface %s to UP state: %s", dev.link.Attrs().Name, err)
}

func (dev *vxlanDevice) Destroy() {
netlink.LinkDel(dev.link)
return nil
}

func (dev *vxlanDevice) MACAddr() net.HardwareAddr {
Expand All @@ -160,14 +151,9 @@ type neighbor struct {
IP ip.IP4
}

func (dev *vxlanDevice) GetL2List() ([]netlink.Neigh, error) {
log.V(4).Infof("calling GetL2List() dev.link.Index: %d ", dev.link.Index)
return netlink.NeighList(dev.link.Index, syscall.AF_BRIDGE)
}

func (dev *vxlanDevice) AddL2(n neighbor) error {
log.V(4).Infof("calling NeighAdd: %v, %v", n.IP, n.MAC)
return netlink.NeighAdd(&netlink.Neigh{
func (dev *vxlanDevice) AddFDB(n neighbor) error {
log.V(4).Infof("calling AddFDB: %v, %v", n.IP, n.MAC)
return netlink.NeighSet(&netlink.Neigh{
LinkIndex: dev.link.Index,
State: netlink.NUD_PERMANENT,
Family: syscall.AF_BRIDGE,
Expand All @@ -177,8 +163,8 @@ func (dev *vxlanDevice) AddL2(n neighbor) error {
})
}

func (dev *vxlanDevice) DelL2(n neighbor) error {
log.V(4).Infof("calling NeighDel: %v, %v", n.IP, n.MAC)
func (dev *vxlanDevice) DelFDB(n neighbor) error {
log.V(4).Infof("calling DelFDB: %v, %v", n.IP, n.MAC)
return netlink.NeighDel(&netlink.Neigh{
LinkIndex: dev.link.Index,
Family: syscall.AF_BRIDGE,
Expand All @@ -188,77 +174,28 @@ func (dev *vxlanDevice) DelL2(n neighbor) error {
})
}

func (dev *vxlanDevice) AddL3(n neighbor) error {
log.V(4).Infof("calling NeighSet: %v, %v", n.IP, n.MAC)
func (dev *vxlanDevice) AddARP(n neighbor) error {
log.V(4).Infof("calling AddARP: %v, %v", n.IP, n.MAC)
return netlink.NeighSet(&netlink.Neigh{
LinkIndex: dev.link.Index,
State: netlink.NUD_REACHABLE,
State: netlink.NUD_PERMANENT,
Type: syscall.RTN_UNICAST,
IP: n.IP.ToIP(),
HardwareAddr: n.MAC,
})
}

func (dev *vxlanDevice) DelL3(n neighbor) error {
log.V(4).Infof("calling NeighDel: %v, %v", n.IP, n.MAC)
func (dev *vxlanDevice) DelARP(n neighbor) error {
log.V(4).Infof("calling DelARP: %v, %v", n.IP, n.MAC)
return netlink.NeighDel(&netlink.Neigh{
LinkIndex: dev.link.Index,
State: netlink.NUD_REACHABLE,
State: netlink.NUD_PERMANENT,
Type: syscall.RTN_UNICAST,
IP: n.IP.ToIP(),
HardwareAddr: n.MAC,
})
}

// MonitorMisses subscribes to netlink RTNLGRP_NEIGH notifications and hands
// every received message to processNeighMsg together with the misses channel.
//
// If the subscription cannot be established the error is logged and the
// method returns. Otherwise it loops without a termination condition; after a
// failed receive it logs the error and waits one second before retrying.
func (dev *vxlanDevice) MonitorMisses(misses chan *netlink.Neigh) {
	nlsock, err := nl.Subscribe(syscall.NETLINK_ROUTE, syscall.RTNLGRP_NEIGH)
	if err != nil {
		// Include the underlying error so the failure can be diagnosed.
		log.Errorf("Failed to subscribe to netlink RTNLGRP_NEIGH messages: %v", err)
		return
	}

	for {
		msgs, err := nlsock.Receive()
		if err != nil {
			log.Errorf("Failed to receive from netlink: %v ", err)

			// Back off briefly so a persistent failure does not spin.
			time.Sleep(1 * time.Second)
			continue
		}

		for _, msg := range msgs {
			dev.processNeighMsg(msg, misses)
		}
	}
}

// isNeighResolving reports whether state has at least one of the
// NUD_INCOMPLETE, NUD_STALE, NUD_DELAY or NUD_PROBE bits set.
func isNeighResolving(state int) bool {
	mask := netlink.NUD_INCOMPLETE |
		netlink.NUD_STALE |
		netlink.NUD_DELAY |
		netlink.NUD_PROBE

	return state&mask != 0
}

// processNeighMsg decodes a single netlink message as a neighbour entry and,
// if it passes the filters below, sends it on the misses channel.
//
// A message is dropped when it cannot be deserialized (the error is logged),
// when it refers to a link other than dev.link, when its type is neither
// RTM_GETNEIGH nor RTM_NEWNEIGH, or when isNeighResolving reports false for
// its state. The send on misses is a plain channel send with no timeout.
func (dev *vxlanDevice) processNeighMsg(msg syscall.NetlinkMessage, misses chan *netlink.Neigh) {
	neigh, err := netlink.NeighDeserialize(msg.Data)
	if err != nil {
		// Errorf (not Error) so that the %v verb is actually formatted.
		log.Errorf("Failed to deserialize netlink ndmsg: %v", err)
		return
	}

	if neigh.LinkIndex != dev.link.Index {
		return
	}

	if msg.Header.Type != syscall.RTM_GETNEIGH && msg.Header.Type != syscall.RTM_NEWNEIGH {
		return
	}

	if !isNeighResolving(neigh.State) {
		// misses come with NUD_STALE bit set
		return
	}

	misses <- neigh
}

func vxlanLinksIncompat(l1, l2 netlink.Link) string {
if l1.Type() != l2.Type() {
return fmt.Sprintf("link type: %v vs %v", l1.Type(), l2.Type())
Expand Down Expand Up @@ -297,17 +234,3 @@ func vxlanLinksIncompat(l1, l2 netlink.Link) string {

return ""
}

// setAddr4 adds the IPv4 address from ipn to link with a /32 mask.
//
// The /32 ensures that no broadcast routes are created for the device. The
// address is only used as a source address for host to workload traffic, so
// that the return path has a decent address to use as the destination.
//
// ipn itself is not modified; the /32 mask is applied to a separate IPNet.
// It returns an error if the address cannot be added to the link.
func setAddr4(link *netlink.Vxlan, ipn *net.IPNet) error {
	// Build a fresh IPNet rather than overwriting the caller's mask.
	host := &net.IPNet{IP: ipn.IP, Mask: net.CIDRMask(32, 32)}

	addr := netlink.Addr{IPNet: host, Label: ""}
	if err := netlink.AddrAdd(link, &addr); err != nil {
		return fmt.Errorf("failed to add IP address %s to %s: %s", host.String(), link.Attrs().Name, err)
	}

	return nil
}
57 changes: 0 additions & 57 deletions backend/vxlan/routes.go

This file was deleted.

51 changes: 40 additions & 11 deletions backend/vxlan/vxlan.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,40 @@

package vxlan

// Some design notes and history:
// VXLAN encapsulates L2 packets (though flannel is L3 only so don't expect to be able to send L2 packets across hosts)
// The first versions of vxlan for flannel registered the flannel daemon as a handler for both "L2" and "L3" misses
// - When a container sends a packet to a new IP address on the flannel network (but on a different host) this generates
// an L2 miss (i.e. an ARP lookup)
// - The flannel daemon knows which flannel host the packet is destined for so it can supply the VTEP MAC to use.
// This is stored in the ARP table (with a timeout) to avoid constantly looking it up.
// - The packet can then be encapsulated but the host needs to know where to send it. This creates another callout from
// the kernel vxlan code to the flannel daemon to get the public IP that should be used for that VTEP (this gets called
// an L3 miss). The L2/L3 miss hooks are registered when the vxlan device is created. At the same time a device route
// is created to the whole flannel network so that non-local traffic is sent over the vxlan device.
//
// In this scheme the scaling of table entries (per host) is:
// - 1 route (for the configured network out the vxlan device)
// - One arp entry for each remote container that this host has recently contacted
// - One FDB entry for each remote host
//
// The second version of flannel vxlan removed the need for the L3MISS callout. When a new remote host is found (either
// during startup or when it's created), flannel simply adds the required entries so that no further lookup/callout is required.
//
//
// The latest version of the vxlan backend removes the need for the L2MISS too, which means that the flannel daemon is not
// listening for any netlink messages anymore. This improves reliability (no problems with timeouts if
// flannel crashes or restarts) and simplifies upgrades.
//
// How it works:
// Create the vxlan device but don't register for any L2MISS or L3MISS messages
// Then, as each remote host is discovered (either on startup or when they are added), do the following
// 1) create routing table entry for the remote subnet. It goes via the vxlan device but also specifies a next hop (of the remote flannel host).
// 2) Create a static ARP entry for the remote flannel host IP address (and the VTEP MAC)
// 3) Create an FDB entry with the VTEP MAC and the public IP of the remote flannel daemon.
//
// In this scheme the scaling of table entries is linear to the number of remote hosts - 1 route, 1 arp entry and 1 FDB entry per host

import (
"encoding/json"
"fmt"
Expand Down Expand Up @@ -99,25 +133,20 @@ func (be *VXLANBackend) RegisterNetwork(ctx context.Context, config *subnet.Conf
lease, err := be.subnetMgr.AcquireLease(ctx, subnetAttrs)
switch err {
case nil:

case context.Canceled, context.DeadlineExceeded:
return nil, err

default:
return nil, fmt.Errorf("failed to acquire lease: %v", err)
}

// vxlan's subnet is that of the whole overlay network (e.g. /16)
// and not that of the individual host (e.g. /24)
vxlanNet := ip.IP4Net{
IP: lease.Subnet.IP,
PrefixLen: config.Network.PrefixLen,
}
if err = dev.Configure(vxlanNet); err != nil {
return nil, err
// Ensure that the device has a /32 address so that no broadcast routes are created.
// This IP is just used as a source address for host to workload traffic (so
// the return path for the traffic has an address on the flannel network to use as the destination)
if err := dev.Configure(ip.IP4Net{IP: lease.Subnet.IP, PrefixLen: 32}); err != nil {
return nil, fmt.Errorf("failed to configure interface %s: %s", dev.link.Attrs().Name, err)
}

return newNetwork(be.subnetMgr, be.extIface, dev, vxlanNet, lease)
return newNetwork(be.subnetMgr, be.extIface, dev, ip.IP4Net{}, lease)
}

// So we can make it JSON (un)marshalable
Expand Down
Loading

0 comments on commit 4973e02

Please sign in to comment.