Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

backend/vxlan: simplify vxlan processing #785

Merged
merged 1 commit into from
Aug 14, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
151 changes: 37 additions & 114 deletions backend/vxlan/device.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,10 @@ package vxlan
import (
"fmt"
"net"
"os"
"syscall"
"time"

log "github.com/golang/glog"
"github.com/vishvananda/netlink"
"github.com/vishvananda/netlink/nl"

"github.com/coreos/flannel/pkg/ip"
)
Expand All @@ -41,17 +38,6 @@ type vxlanDevice struct {
link *netlink.Vxlan
}

// sysctlSet writes value to the file at path, creating the file if it does not
// exist and truncating it if it does. It returns the error from opening or
// writing the file, or nil when both succeed.
func sysctlSet(path, value string) error {
	file, createErr := os.Create(path)
	if createErr != nil {
		return createErr
	}
	defer file.Close()

	if _, writeErr := file.Write([]byte(value)); writeErr != nil {
		return writeErr
	}
	return nil
}

func newVXLANDevice(devAttrs *vxlanDeviceAttrs) (*vxlanDevice, error) {
link := &netlink.Vxlan{
LinkAttrs: netlink.LinkAttrs{
Expand All @@ -69,12 +55,6 @@ func newVXLANDevice(devAttrs *vxlanDeviceAttrs) (*vxlanDevice, error) {
if err != nil {
return nil, err
}
// this enables ARP requests being sent to userspace via netlink
sysctlPath := fmt.Sprintf("/proc/sys/net/ipv4/neigh/%s/app_solicit", devAttrs.name)
if err := sysctlSet(sysctlPath, "3"); err != nil {
return nil, err
}

return &vxlanDevice{
link: link,
}, nil
Expand All @@ -84,13 +64,15 @@ func ensureLink(vxlan *netlink.Vxlan) (*netlink.Vxlan, error) {
err := netlink.LinkAdd(vxlan)
if err == syscall.EEXIST {
// it's ok if the device already exists as long as config is similar
log.V(1).Infof("VXLAN device already exists")
existing, err := netlink.LinkByName(vxlan.Name)
if err != nil {
return nil, err
}

incompat := vxlanLinksIncompat(vxlan, existing)
if incompat == "" {
log.V(1).Infof("Returning existing device")
return existing.(*netlink.Vxlan), nil
}

Expand Down Expand Up @@ -123,28 +105,37 @@ func ensureLink(vxlan *netlink.Vxlan) (*netlink.Vxlan, error) {
}

func (dev *vxlanDevice) Configure(ipn ip.IP4Net) error {
setAddr4(dev.link, ipn.ToIPNet())
addr := netlink.Addr{IPNet: ipn.ToIPNet()}
existingAddrs, err := netlink.AddrList(dev.link, netlink.FAMILY_V4)
if err != nil {
return err
}

if err := netlink.LinkSetUp(dev.link); err != nil {
return fmt.Errorf("failed to set interface %s to UP state: %s", dev.link.Attrs().Name, err)
// flannel will never make this happen. This situation can only be caused by a user, so get them to sort it out.
if len(existingAddrs) > 1 {
return fmt.Errorf("link has incompatible addresses. Remove additional addresses and try again. %s", dev.link)
}

// explicitly add a route since there might be a route for a subnet already
// installed by Docker and then it won't get auto added
route := netlink.Route{
LinkIndex: dev.link.Attrs().Index,
Scope: netlink.SCOPE_UNIVERSE,
Dst: ipn.Network().ToIPNet(),
// If the device has an incompatible address then delete it. This can happen if the lease changes for example.
if len(existingAddrs) == 1 && !existingAddrs[0].Equal(addr) {
if err := netlink.AddrDel(dev.link, &existingAddrs[0]); err != nil {
return fmt.Errorf("failed to remove IP address %s from %s: %s", ipn.String(), dev.link.Attrs().Name, err)
}
existingAddrs = []netlink.Addr{}
}
if err := netlink.RouteAdd(&route); err != nil && err != syscall.EEXIST {
return fmt.Errorf("failed to add route (%s -> %s): %v", ipn.Network().String(), dev.link.Attrs().Name, err)

// Actually add the desired address to the interface if needed.
if len(existingAddrs) == 0 {
if err := netlink.AddrAdd(dev.link, &addr); err != nil {
return fmt.Errorf("failed to add IP address %s to %s: %s", ipn.String(), dev.link.Attrs().Name, err)
}
}

return nil
}
if err := netlink.LinkSetUp(dev.link); err != nil {
return fmt.Errorf("failed to set interface %s to UP state: %s", dev.link.Attrs().Name, err)
}

func (dev *vxlanDevice) Destroy() {
netlink.LinkDel(dev.link)
return nil
}

func (dev *vxlanDevice) MACAddr() net.HardwareAddr {
Expand All @@ -160,14 +151,9 @@ type neighbor struct {
IP ip.IP4
}

// GetL2List returns whatever netlink.NeighList reports for this device's link
// index in the AF_BRIDGE family, along with any error from that call.
func (dev *vxlanDevice) GetL2List() ([]netlink.Neigh, error) {
	linkIndex := dev.link.Index
	log.V(4).Infof("calling GetL2List() dev.link.Index: %d ", linkIndex)

	return netlink.NeighList(linkIndex, syscall.AF_BRIDGE)
}

func (dev *vxlanDevice) AddL2(n neighbor) error {
log.V(4).Infof("calling NeighAdd: %v, %v", n.IP, n.MAC)
return netlink.NeighAdd(&netlink.Neigh{
func (dev *vxlanDevice) AddFDB(n neighbor) error {
log.V(4).Infof("calling AddFDB: %v, %v", n.IP, n.MAC)
return netlink.NeighSet(&netlink.Neigh{
LinkIndex: dev.link.Index,
State: netlink.NUD_PERMANENT,
Family: syscall.AF_BRIDGE,
Expand All @@ -177,8 +163,8 @@ func (dev *vxlanDevice) AddL2(n neighbor) error {
})
}

func (dev *vxlanDevice) DelL2(n neighbor) error {
log.V(4).Infof("calling NeighDel: %v, %v", n.IP, n.MAC)
func (dev *vxlanDevice) DelFDB(n neighbor) error {
log.V(4).Infof("calling DelFDB: %v, %v", n.IP, n.MAC)
return netlink.NeighDel(&netlink.Neigh{
LinkIndex: dev.link.Index,
Family: syscall.AF_BRIDGE,
Expand All @@ -188,77 +174,28 @@ func (dev *vxlanDevice) DelL2(n neighbor) error {
})
}

func (dev *vxlanDevice) AddL3(n neighbor) error {
log.V(4).Infof("calling NeighSet: %v, %v", n.IP, n.MAC)
func (dev *vxlanDevice) AddARP(n neighbor) error {
log.V(4).Infof("calling AddARP: %v, %v", n.IP, n.MAC)
return netlink.NeighSet(&netlink.Neigh{
LinkIndex: dev.link.Index,
State: netlink.NUD_REACHABLE,
State: netlink.NUD_PERMANENT,
Type: syscall.RTN_UNICAST,
IP: n.IP.ToIP(),
HardwareAddr: n.MAC,
})
}

func (dev *vxlanDevice) DelL3(n neighbor) error {
log.V(4).Infof("calling NeighDel: %v, %v", n.IP, n.MAC)
func (dev *vxlanDevice) DelARP(n neighbor) error {
log.V(4).Infof("calling DelARP: %v, %v", n.IP, n.MAC)
return netlink.NeighDel(&netlink.Neigh{
LinkIndex: dev.link.Index,
State: netlink.NUD_REACHABLE,
State: netlink.NUD_PERMANENT,
Type: syscall.RTN_UNICAST,
IP: n.IP.ToIP(),
HardwareAddr: n.MAC,
})
}

// MonitorMisses subscribes to netlink RTNLGRP_NEIGH notifications and hands
// every received message to processNeighMsg together with the misses channel.
//
// If the subscription cannot be established the error is logged and the
// function returns. Otherwise it loops forever: a receive failure is logged and
// retried after a one second pause.
func (dev *vxlanDevice) MonitorMisses(misses chan *netlink.Neigh) {
	nlsock, err := nl.Subscribe(syscall.NETLINK_ROUTE, syscall.RTNLGRP_NEIGH)
	if err != nil {
		// Include the underlying error so the cause of the failure is visible.
		log.Errorf("Failed to subscribe to netlink RTNLGRP_NEIGH messages: %v", err)
		return
	}

	for {
		msgs, err := nlsock.Receive()
		if err != nil {
			log.Errorf("Failed to receive from netlink: %v ", err)

			// Pause before retrying so a persistent failure does not busy-loop.
			time.Sleep(1 * time.Second)
			continue
		}

		for _, msg := range msgs {
			dev.processNeighMsg(msg, misses)
		}
	}
}

// isNeighResolving reports whether state has at least one of the
// NUD_INCOMPLETE, NUD_STALE, NUD_DELAY or NUD_PROBE bits set.
func isNeighResolving(state int) bool {
	resolvingMask := netlink.NUD_INCOMPLETE |
		netlink.NUD_STALE |
		netlink.NUD_DELAY |
		netlink.NUD_PROBE

	return state&resolvingMask != 0
}

// processNeighMsg decodes a single netlink message as a neighbor entry and
// sends it on misses when all of the following hold:
//   - the entry belongs to this device's link index,
//   - the message type is RTM_GETNEIGH or RTM_NEWNEIGH,
//   - isNeighResolving accepts the entry's state.
//
// Messages that fail to decode are logged and dropped; all other
// non-matching messages are dropped silently. The send on misses is a plain
// channel send, so it blocks until the value is received or buffered.
func (dev *vxlanDevice) processNeighMsg(msg syscall.NetlinkMessage, misses chan *netlink.Neigh) {
	neigh, err := netlink.NeighDeserialize(msg.Data)
	if err != nil {
		// Errorf (not Error) is required for the %v verb to be expanded.
		log.Errorf("Failed to deserialize netlink ndmsg: %v", err)
		return
	}

	if neigh.LinkIndex != dev.link.Index {
		return
	}

	if msg.Header.Type != syscall.RTM_GETNEIGH && msg.Header.Type != syscall.RTM_NEWNEIGH {
		return
	}

	if !isNeighResolving(neigh.State) {
		// misses come with NUD_STALE bit set
		return
	}

	misses <- neigh
}

func vxlanLinksIncompat(l1, l2 netlink.Link) string {
if l1.Type() != l2.Type() {
return fmt.Sprintf("link type: %v vs %v", l1.Type(), l2.Type())
Expand Down Expand Up @@ -297,17 +234,3 @@ func vxlanLinksIncompat(l1, l2 netlink.Link) string {

return ""
}

// setAddr4 adds the IPv4 address from ipn to link with a /32 mask.
//
// The device gets a /32 address so that no broadcast routes are created.
// This IP is just used as a source address for host to workload traffic (so
// the return path for the traffic has a decent address to use as the destination).
//
// The /32 mask is applied to a copy, so the caller's ipn is not modified.
// It returns a wrapped error if netlink.AddrAdd fails, nil otherwise.
func setAddr4(link *netlink.Vxlan, ipn *net.IPNet) error {
	hostNet := &net.IPNet{IP: ipn.IP, Mask: net.CIDRMask(32, 32)}
	addr := netlink.Addr{IPNet: hostNet, Label: ""}
	if err := netlink.AddrAdd(link, &addr); err != nil {
		return fmt.Errorf("failed to add IP address %s to %s: %s", hostNet.String(), link.Attrs().Name, err)
	}

	return nil
}
57 changes: 0 additions & 57 deletions backend/vxlan/routes.go

This file was deleted.

51 changes: 40 additions & 11 deletions backend/vxlan/vxlan.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,40 @@

package vxlan

// Some design notes and history:
// VXLAN encapsulates L2 packets (though flannel is L3 only so don't expect to be able to send L2 packets across hosts)
// The first versions of vxlan for flannel registered the flannel daemon as a handler for both "L2" and "L3" misses
// - When a container sends a packet to a new IP address on the flannel network (but on a different host) this generates
// an L2 miss (i.e. an ARP lookup)
// - The flannel daemon knows which flannel host the packet is destined for so it can supply the VTEP MAC to use.
// This is stored in the ARP table (with a timeout) to avoid constantly looking it up.
// - The packet can then be encapsulated but the host needs to know where to send it. This creates another callout from
// the kernel vxlan code to the flannel daemon to get the public IP that should be used for that VTEP (this gets called
// an L3 miss). The L2/L3 miss hooks are registered when the vxlan device is created. At the same time a device route
// is created to the whole flannel network so that non-local traffic is sent over the vxlan device.
//
// In this scheme the scaling of table entries (per host) is:
// - 1 route (for the configured network out the vxlan device)
// - One arp entry for each remote container that this host has recently contacted
// - One FDB entry for each remote host
//
// The second version of flannel vxlan removed the need for the L3MISS callout. When a new remote host is found (either
// during startup or when it's created), flannel simply adds the required entries so that no further lookup/callout is required.
//
//
// The latest version of the vxlan backend removes the need for the L2MISS too, which means that the flannel daemon is not
// listening for any netlink messages anymore. This improves reliability (no problems with timeouts if
// flannel crashes or restarts) and simplifies upgrades.
//
// How it works:
// Create the vxlan device but don't register for any L2MISS or L3MISS messages
// Then, as each remote host is discovered (either on startup or when they are added), do the following
// 1) create routing table entry for the remote subnet. It goes via the vxlan device but also specifies a next hop (of the remote flannel host).
// 2) Create a static ARP entry for the remote flannel host IP address (and the VTEP MAC)
// 3) Create an FDB entry with the VTEP MAC and the public IP of the remote flannel daemon.
//
// In this scheme the scaling of table entries is linear to the number of remote hosts - 1 route, 1 arp entry and 1 FDB entry per host

import (
"encoding/json"
"fmt"
Expand Down Expand Up @@ -99,25 +133,20 @@ func (be *VXLANBackend) RegisterNetwork(ctx context.Context, config *subnet.Conf
lease, err := be.subnetMgr.AcquireLease(ctx, subnetAttrs)
switch err {
case nil:

case context.Canceled, context.DeadlineExceeded:
return nil, err

default:
return nil, fmt.Errorf("failed to acquire lease: %v", err)
}

// vxlan's subnet is that of the whole overlay network (e.g. /16)
// and not that of the individual host (e.g. /24)
vxlanNet := ip.IP4Net{
IP: lease.Subnet.IP,
PrefixLen: config.Network.PrefixLen,
}
if err = dev.Configure(vxlanNet); err != nil {
return nil, err
// Ensure that the device has a /32 address so that no broadcast routes are created.
// This IP is just used as a source address for host to workload traffic (so
// the return path for the traffic has an address on the flannel network to use as the destination)
if err := dev.Configure(ip.IP4Net{IP: lease.Subnet.IP, PrefixLen: 32}); err != nil {
return nil, fmt.Errorf("failed to configure interface %s: %s", dev.link.Attrs().Name, err)
}

return newNetwork(be.subnetMgr, be.extIface, dev, vxlanNet, lease)
return newNetwork(be.subnetMgr, be.extIface, dev, ip.IP4Net{}, lease)
}

// So we can make it JSON (un)marshalable
Expand Down
Loading