Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add BGP capabilities to the NAT GW #4285

Merged
merged 15 commits into from
Jul 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 25 additions & 0 deletions charts/kube-ovn/templates/kube-ovn-crd.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -503,6 +503,31 @@ spec:
type: string
qosPolicy:
type: string
bgpSpeaker:
type: object
properties:
enabled:
type: boolean
asn:
type: integer
remoteAsn:
type: integer
neighbors:
type: array
items:
type: string
holdTime:
type: string
routerId:
type: string
password:
type: string
enableGracefulRestart:
type: boolean
extraArgs:
type: array
items:
type: string
tolerations:
type: array
items:
Expand Down
25 changes: 25 additions & 0 deletions dist/images/install.sh
Original file line number Diff line number Diff line change
Expand Up @@ -741,6 +741,31 @@ spec:
type: string
qosPolicy:
type: string
bgpSpeaker:
type: object
properties:
enabled:
type: boolean
asn:
type: integer
remoteAsn:
type: integer
neighbors:
type: array
items:
type: string
holdTime:
type: string
routerId:
type: string
password:
type: string
enableGracefulRestart:
type: boolean
extraArgs:
type: array
items:
type: string
tolerations:
type: array
items:
Expand Down
13 changes: 13 additions & 0 deletions pkg/apis/kubeovn/v1/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -524,6 +524,19 @@ type VpcNatSpec struct {
Tolerations []corev1.Toleration `json:"tolerations"`
Affinity corev1.Affinity `json:"affinity"`
QoSPolicy string `json:"qosPolicy"`
BgpSpeaker VpcBgpSpeaker `json:"bgpSpeaker"`
}

// VpcBgpSpeaker holds the per-gateway settings of the BGP speaker sidecar that
// can be attached to a VPC NAT gateway. The fields are translated into
// command-line arguments of the speaker container when the gateway StatefulSet
// is generated.
type VpcBgpSpeaker struct {
// Enabled turns the BGP speaker sidecar on for this gateway.
Enabled bool `json:"enabled"`
// ASN is the local AS number, passed as --cluster-as. Must be non-zero when the speaker is enabled.
ASN uint32 `json:"asn"`
// RemoteASN is the AS number of the peers, passed as --neighbor-as. Must be non-zero when the speaker is enabled.
RemoteASN uint32 `json:"remoteAsn"`
// Neighbors lists the peer addresses. They are split by IP family into
// --neighbor-address (IPv4) and --neighbor-ipv6-address (IPv6); at least one entry is required.
Neighbors []string `json:"neighbors"`
// HoldTime is passed as --holdtime when it is not the zero duration.
HoldTime metav1.Duration `json:"holdTime"`
// RouterID is passed as --router-id when non-empty, overriding the auto-selected router ID.
RouterID string `json:"routerId"`
// Password is passed as --auth-password when non-empty (TCP MD5 authentication of the BGP session).
Password string `json:"password"`
// EnableGracefulRestart adds the --graceful-restart flag.
EnableGracefulRestart bool `json:"enableGracefulRestart"`
// ExtraArgs are appended verbatim after the generated arguments, e.g. to tune logging levels.
ExtraArgs []string `json:"extraArgs"`
}

type VpcNatStatus struct {
Expand Down
2 changes: 1 addition & 1 deletion pkg/controller/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -1112,7 +1112,7 @@ func (c *Controller) startWorkers(ctx context.Context) {
}, time.Second, ctx.Done())

go wait.Until(func() {
c.resyncVpcNatImage()
c.resyncVpcNatConfig()
}, time.Second, ctx.Done())

go wait.Until(func() {
Expand Down
20 changes: 18 additions & 2 deletions pkg/controller/vpc_nat.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,20 +8,36 @@ import (
"github.com/kubeovn/kube-ovn/pkg/util"
)

var vpcNatImage = ""
// Settings of the VPC NAT gateways, populated by resyncVpcNatConfig from the
// NAT gateway ConfigMap.
var (
// vpcNatImage is the image used to provision the NAT gateways (ConfigMap key "image").
vpcNatImage = ""
// vpcNatGwBgpSpeakerImage is the optional image of the BGP speaker sidecar (ConfigMap key "bgpSpeakerImage").
vpcNatGwBgpSpeakerImage = ""
// vpcNatAPINadName is the NetworkAttachmentDefinition name the BGP speaker uses to reach the API server (ConfigMap key "apiNadName").
vpcNatAPINadName = ""
// vpcNatAPINadProvider is the provider of that NetworkAttachmentDefinition (ConfigMap key "apiNadProvider").
vpcNatAPINadProvider = ""
)

func (c *Controller) resyncVpcNatImage() {
// resyncVpcNatConfig reloads the NAT gateway settings from the ConfigMap named
// util.VpcNatConfig in the controller's pod namespace and stores them in the
// package-level variables.
//
// The "image" key is mandatory: when the ConfigMap cannot be fetched or the key
// is absent, the error is logged and every previously loaded value is left
// untouched. The remaining keys are optional and default to the empty string.
func (c *Controller) resyncVpcNatConfig() {
	configMap, err := c.configMapsLister.ConfigMaps(c.config.PodNamespace).Get(util.VpcNatConfig)
	if err != nil {
		klog.Error(fmt.Errorf("failed to get ovn-vpc-nat-config, %w", err))
		return
	}
	data := configMap.Data

	// Mandatory: image used to provision the NAT gateways.
	natImage, ok := data["image"]
	if !ok {
		klog.Error(fmt.Errorf("%s should have image field", util.VpcNatConfig))
		return
	}
	vpcNatImage = natImage

	// Optional: BGP sidecar image, plus the NetworkAttachmentDefinition name and
	// provider the BGP speaker relies on to call the API server.
	vpcNatGwBgpSpeakerImage = data["bgpSpeakerImage"]
	vpcNatAPINadName = data["apiNadName"]
	vpcNatAPINadProvider = data["apiNadProvider"]
}
161 changes: 161 additions & 0 deletions pkg/controller/vpc_nat_gateway.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import (
"errors"
"fmt"
"maps"
"os"
"reflect"
"regexp"
"slices"
Expand Down Expand Up @@ -735,6 +736,52 @@ func (c *Controller) execNatGwRules(pod *corev1.Pod, operation string, rules []s
return nil
}

func (c *Controller) setNatGwInterface(annotations map[string]string, externalNetwork string, defaultSubnet *kubeovnv1.Subnet) error {
if vpcNatAPINadName == "" {
return errors.New("no NetworkAttachmentDefinition provided to access apiserver, check configmap ovn-vpc-nat-config and field 'apiNadName'")
}

nad := fmt.Sprintf("%s/%s, %s/%s", c.config.PodNamespace, externalNetwork, corev1.NamespaceDefault, vpcNatAPINadName)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If the NAD namespace is always default, there is no need to set both apiNadName and apiNadProvider in the configMap since <apiNadProvider> equals <apiNadName>.default.ovn or <apiNadName>.default.

How about getting NAD name and namespace from the NAD provider?

annotations[util.AttachmentNetworkAnnotation] = nad

return setNatGwRoute(annotations, defaultSubnet.Spec.Gateway)
}

func setNatGwRoute(annotations map[string]string, subnetGw string) error {
dst := os.Getenv("KUBERNETES_SERVICE_HOST")

protocol := util.CheckProtocol(dst)
if !strings.ContainsRune(dst, '/') {
switch protocol {
case kubeovnv1.ProtocolIPv4:
dst = fmt.Sprintf("%s/32", dst)
case kubeovnv1.ProtocolIPv6:
dst = fmt.Sprintf("%s/128", dst)
}
}

// Check the API NetworkAttachmentDefinition exists, otherwise we won't be able to attach
// the BGP speaker to a network that has access to the K8S apiserver (and won't be able to detect EIPs)
if vpcNatAPINadProvider == "" {
return errors.New("no NetworkAttachmentDefinition provided to access apiserver, check configmap ovn-vpc-nat-config and field 'apiNadName'")
}

for _, gw := range strings.Split(subnetGw, ",") {
if util.CheckProtocol(gw) == protocol {
routes := []request.Route{{Destination: dst, Gateway: gw}}
buf, err := json.Marshal(routes)
if err != nil {
return fmt.Errorf("failed to marshal routes %+v: %w", routes, err)
}

annotations[fmt.Sprintf(util.RoutesAnnotationTemplate, vpcNatAPINadProvider)] = string(buf)
break
}
}

return nil
}

func (c *Controller) genNatGwStatefulSet(gw *kubeovnv1.VpcNatGateway, oldSts *v1.StatefulSet) (*v1.StatefulSet, error) {
annotations := make(map[string]string, 7)
if oldSts != nil && len(oldSts.Annotations) != 0 {
Expand All @@ -747,6 +794,18 @@ func (c *Controller) genNatGwStatefulSet(gw *kubeovnv1.VpcNatGateway, oldSts *v1
util.LogicalSwitchAnnotation: gw.Spec.Subnet,
util.IPAddressAnnotation: gw.Spec.LanIP,
}

if gw.Spec.BgpSpeaker.Enabled { // Add an interface that can reach the API server
defaultSubnet, err := c.subnetsLister.Get(c.config.DefaultLogicalSwitch)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since we have introduced configMap option apiNadProvider, we should use the nad instead of hard-coded default subnet.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should use function findSubnetByNetworkAttachmentDefinition instead of using the default subnet directly.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The Kubernetes apiserver runs in the default subnet doesn't it? The NAD just happens to be connected to that subnet. Do you want me to:

  • Lookup every subnet in the cluster
  • Determine which one has a provider equal to our NAD
  • Get its gateway

That can be an option

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The Kubernetes apiserver runs in the default subnet doesn't it?

K8s apiserver runs in control plane nodes with host network. It can be accessed from subnets in the default vpc.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The NAD can be pointed to ANY subnet which is running in the default vpc.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Cool, should kube-ovn have a default NAD installed in the default VPC for kube-dns and the NAT GW? That would be extremely handy.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Here is an example I'm using:

apiVersion: "k8s.cni.cncf.io/v1"
kind: NetworkAttachmentDefinition
metadata:
  name: ovn-nad
  namespace: default
spec:
  config: '{
      "cniVersion": "0.3.0",
      "type": "kube-ovn",
      "server_socket": "/run/openvswitch/kube-ovn-daemon.sock",
      "provider": "ovn-nad.default.ovn"
    }'
---
apiVersion: kubeovn.io/v1
kind: Subnet
metadata:
  name: vpc-apiserver-subnet
spec:
  protocol: IPv4
  cidrBlock: 100.100.100.0/24
  provider: ovn-nad.default.ovn

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

should kube-ovn have a default NAD installed in the default VPC for kube-dns and the NAT GW?

NAD resources can be created only when the CRD is installed in the cluster, while the CRD should be installed by users.

if err != nil {
return nil, fmt.Errorf("failed to get default subnet %s: %w", c.config.DefaultLogicalSwitch, err)
}

if err := c.setNatGwInterface(podAnnotations, nadName, defaultSubnet); err != nil {
return nil, err
}
}

for key, value := range podAnnotations {
annotations[key] = value
}
Expand Down Expand Up @@ -782,6 +841,7 @@ func (c *Controller) genNatGwStatefulSet(gw *kubeovnv1.VpcNatGateway, oldSts *v1
routes = append(routes, request.Route{Destination: cidrV6, Gateway: v6Gateway})
}
}

if err = setPodRoutesAnnotation(annotations, util.OvnProvider, routes); err != nil {
klog.Error(err)
return nil, err
Expand Down Expand Up @@ -820,6 +880,7 @@ func (c *Controller) genNatGwStatefulSet(gw *kubeovnv1.VpcNatGateway, oldSts *v1
"app": name,
util.VpcNatGatewayLabel: "true",
}

sts := &v1.StatefulSet{
ObjectMeta: metav1.ObjectMeta{
Name: name,
Expand Down Expand Up @@ -859,6 +920,106 @@ func (c *Controller) genNatGwStatefulSet(gw *kubeovnv1.VpcNatGateway, oldSts *v1
},
},
}

// BGP speaker for GWs must be enabled globally and for this specific instance
if gw.Spec.BgpSpeaker.Enabled {
containers := sts.Spec.Template.Spec.Containers

// We need a speaker image configured in the NAT GW ConfigMap
if vpcNatGwBgpSpeakerImage == "" {
return nil, fmt.Errorf("%s should have bgp speaker image field if bgp enabled", util.VpcNatConfig)
}

args := []string{
"--nat-gw-mode", // Force to run in NAT GW mode, we're not announcing Pod IPs or Services, only EIPs
}

speakerParams := gw.Spec.BgpSpeaker

if speakerParams.RouterID != "" { // Override default auto-selected RouterID
args = append(args, fmt.Sprintf("--router-id=%s", speakerParams.RouterID))
}

if speakerParams.Password != "" { // Password for TCP MD5 BGP
args = append(args, fmt.Sprintf("--auth-password=%s", speakerParams.Password))
}

if speakerParams.EnableGracefulRestart { // Enable graceful restart
args = append(args, "--graceful-restart")
}

if speakerParams.HoldTime != (metav1.Duration{}) { // Hold time
args = append(args, fmt.Sprintf("--holdtime=%s", speakerParams.HoldTime.Duration.String()))
}

if speakerParams.ASN == 0 { // The ASN we use to speak
return nil, errors.New("ASN not set, but must be non-zero value")
}

if speakerParams.RemoteASN == 0 { // The ASN we speak to
return nil, errors.New("remote ASN not set, but must be non-zero value")
}

args = append(args, fmt.Sprintf("--cluster-as=%d", speakerParams.ASN))
args = append(args, fmt.Sprintf("--neighbor-as=%d", speakerParams.RemoteASN))

if len(speakerParams.Neighbors) == 0 {
return nil, errors.New("no BGP neighbors specified")
}

var neighIPv4 []string
var neighIPv6 []string
for _, neighbor := range speakerParams.Neighbors {
switch util.CheckProtocol(neighbor) {
case kubeovnv1.ProtocolIPv4:
neighIPv4 = append(neighIPv4, neighbor)
case kubeovnv1.ProtocolIPv6:
neighIPv6 = append(neighIPv6, neighbor)
}
}

argNeighIPv4 := strings.Join(neighIPv4, ",")
argNeighIPv6 := strings.Join(neighIPv6, ",")
argNeighIPv4 = fmt.Sprintf("--neighbor-address=%s", argNeighIPv4)
argNeighIPv6 = fmt.Sprintf("--neighbor-ipv6-address=%s", argNeighIPv6)

if len(neighIPv4) > 0 {
args = append(args, argNeighIPv4)
}

if len(neighIPv6) > 0 {
args = append(args, argNeighIPv6)
}

// Extra args to start the speaker with, for example, logging levels...
args = append(args, speakerParams.ExtraArgs...)

sts.Spec.Template.Spec.ServiceAccountName = "vpc-nat-gw"
speakerContainer := corev1.Container{
Name: "vpc-nat-gw-speaker",
Image: vpcNatGwBgpSpeakerImage,
Command: []string{"/kube-ovn/kube-ovn-speaker"},
ImagePullPolicy: corev1.PullIfNotPresent,
Env: []corev1.EnvVar{
{
Name: util.GatewayNameEnv,
Value: gw.Name,
},
{
Name: "POD_IP",
ValueFrom: &corev1.EnvVarSource{
FieldRef: &corev1.ObjectFieldSelector{
FieldPath: "status.podIP",
},
},
},
},
Args: args,
}

sts.Spec.Template.Spec.Containers = append(containers, speakerContainer)
}

return sts, nil
}

Expand Down
Loading
Loading