From 2b3a6988e003430487fe307d7915b2457e2a6550 Mon Sep 17 00:00:00 2001
From: Jayanth Varavani <1111446+jayanthvn@users.noreply.github.com>
Date: Thu, 20 Jun 2024 22:45:01 +0000
Subject: [PATCH 1/3] Configure conntrack cache table size

---
 .go-version                                |  2 +-
 README.md                                  | 17 ++++++++++++++++
 controllers/policyendpoints_controller.go  |  4 ++--
 .../policyendpoints_controller_test.go     |  6 +++---
 go.mod                                     |  4 ++--
 go.sum                                     |  8 ++++++--
 main.go                                    |  8 +++++++-
 pkg/config/controller_config.go            | 20 ++++++++++++++++++-
 pkg/ebpf/bpf_client.go                     | 15 +++++++++++---
 9 files changed, 69 insertions(+), 15 deletions(-)

diff --git a/.go-version b/.go-version
index f124bfa..88863fd 100644
--- a/.go-version
+++ b/.go-version
@@ -1 +1 @@
-1.21.9
+1.21.11
diff --git a/README.md b/README.md
index 1094593..223cba7 100644
--- a/README.md
+++ b/README.md
@@ -82,6 +82,23 @@ Network Policy agent can operate in either IPv4 or IPv6 mode. Setting this flag
 
 **Note:** VPC CNI by default creates an egress only IPv4 interface for IPv6 pods and this network interface will not be secured by the Network policy feature. Network policies will only be enforced on the Pod's primary interface (i.e.,) `eth0`. If you want to block the egress IPv4 access, please disable the interface creation via [ENABLE_V4_EGRESS](https://github.com/aws/amazon-vpc-cni-k8s#enable_v4_egress-v1151) flag in VPC CNI.
 
+#### `conntrack-cache-cleanup-period` (from v1.0.7+)
+
+Type: Integer
+
+Default: 300
+
+Network Policy agent maintains a local conntrack cache. This configuration (in seconds) determines how often the local conntrack cache is cleaned of stale/expired entries. At every interval, the agent checks each entry in the local conntrack cache against the kernel conntrack table and determines whether the entry should be deleted.
+
+#### `conntrack-cache-table-size` (from v1.1.3+)
+
+Type: Integer
+
+Default: 1024 * 256
+
+Network Policy agent maintains a local conntrack cache. Ideally, this should be the same size as the kernel conntrack table. Note that this should be configured on new nodes before enabling network policy; if network policy is already enabled, changing this value requires a reload of the nodes. Dynamically updating the conntrack map size would lead to traffic disruption, hence we won't support it. Supported values are between 32K and 1024K.
+
+
 ## Network Policy Agent CLI
 The Amazon VPC CNI plugin for Kubernetes installs eBPF SDK collection of tools on the nodes. You can use the eBPF SDK tools to identify issues with network policies. For example, the following command lists the programs that are running on the node.
 
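As an illustration of how the two flags documented above fit together, here is a minimal, standalone sketch, not the agent's actual `main()`, that binds `conntrack-cache-cleanup-period` and `conntrack-cache-table-size` with `pflag` using the defaults from this patch and rejects table sizes outside the 32K to 1024K range enforced by `ValidControllerFlags` later in this patch.

```go
package main

import (
	"fmt"
	"os"

	"github.com/spf13/pflag"
)

func main() {
	// Bind the two conntrack-related flags with the documented defaults.
	cleanupPeriod := pflag.Int("conntrack-cache-cleanup-period", 300,
		"Cleanup interval (seconds) for the local conntrack cache")
	tableSize := pflag.Int("conntrack-cache-table-size", 256*1024,
		"Size of the local conntrack cache; ideally the kernel conntrack table size")
	pflag.Parse()

	// The agent validates the table size at startup; anything outside
	// the 32K to 1024K entry range is rejected.
	if *tableSize < 32*1024 || *tableSize > 1024*1024 {
		fmt.Fprintln(os.Stderr, "conntrack-cache-table-size must be between 32K and 1024K")
		os.Exit(1)
	}
	fmt.Printf("conntrack cache: cleanup every %ds, table size %d entries\n",
		*cleanupPeriod, *tableSize)
}
```

Running this sketch with `--conntrack-cache-table-size=16384`, for example, prints the out-of-range message and exits, mirroring the validation the agent performs at startup.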
diff --git a/controllers/policyendpoints_controller.go b/controllers/policyendpoints_controller.go
index 2f151c4..014ea59 100644
--- a/controllers/policyendpoints_controller.go
+++ b/controllers/policyendpoints_controller.go
@@ -74,7 +74,7 @@ func prometheusRegister() {
 
 // NewPolicyEndpointsReconciler constructs new PolicyEndpointReconciler
 func NewPolicyEndpointsReconciler(k8sClient client.Client, log logr.Logger,
-	enablePolicyEventLogs, enableCloudWatchLogs bool, enableIPv6 bool, enableNetworkPolicy bool, conntrackTTL int) (*PolicyEndpointsReconciler, error) {
+	enablePolicyEventLogs, enableCloudWatchLogs bool, enableIPv6 bool, enableNetworkPolicy bool, conntrackTTL int, conntrackTableSize int) (*PolicyEndpointsReconciler, error) {
 	r := &PolicyEndpointsReconciler{
 		k8sClient: k8sClient,
 		log:       log,
@@ -89,7 +89,7 @@ func NewPolicyEndpointsReconciler(k8sClient client.Client, log logr.Logger,
 	var err error
 	if enableNetworkPolicy {
 		r.ebpfClient, err = ebpf.NewBpfClient(&r.policyEndpointeBPFContext, r.nodeIP,
-			enablePolicyEventLogs, enableCloudWatchLogs, enableIPv6, conntrackTTL)
+			enablePolicyEventLogs, enableCloudWatchLogs, enableIPv6, conntrackTTL, conntrackTableSize)
 
 		// Start prometheus
 		prometheusRegister()
diff --git a/controllers/policyendpoints_controller_test.go b/controllers/policyendpoints_controller_test.go
index ef2297c..71adaa2 100644
--- a/controllers/policyendpoints_controller_test.go
+++ b/controllers/policyendpoints_controller_test.go
@@ -329,7 +329,7 @@ func TestDeriveIngressAndEgressFirewallRules(t *testing.T) {
 			mockClient := mock_client.NewMockClient(ctrl)
 
 			policyEndpointReconciler, _ := NewPolicyEndpointsReconciler(mockClient, logr.New(&log.NullLogSink{}),
-				false, false, false, false, 300)
+				false, false, false, false, 300, 262144)
 			var policyEndpointsList []string
 			policyEndpointsList = append(policyEndpointsList, tt.policyEndpointName)
 			policyEndpointReconciler.podIdentifierToPolicyEndpointMap.Store(tt.podIdentifier, policyEndpointsList)
@@ -748,7 +748,7 @@ func TestArePoliciesAvailableInLocalCache(t *testing.T) {
 			mockClient := mock_client.NewMockClient(ctrl)
 
 			policyEndpointReconciler, _ := NewPolicyEndpointsReconciler(mockClient, logr.New(&log.NullLogSink{}),
-				false, false, false, false, 300)
+				false, false, false, false, 300, 262144)
 			var policyEndpointsList []string
 			policyEndpointsList = append(policyEndpointsList, tt.policyEndpointName...)
 			policyEndpointReconciler.podIdentifierToPolicyEndpointMap.Store(tt.podIdentifier, policyEndpointsList)
@@ -994,7 +994,7 @@ func TestDeriveFireWallRulesPerPodIdentifier(t *testing.T) {
 			mockClient := mock_client.NewMockClient(ctrl)
 
 			policyEndpointReconciler, _ := NewPolicyEndpointsReconciler(mockClient, logr.New(&log.NullLogSink{}),
-				false, false, false, false, 300)
+				false, false, false, false, 300, 262144)
 			var policyEndpointsList []string
 			policyEndpointsList = append(policyEndpointsList, tt.policyEndpointName)
 			policyEndpointReconciler.podIdentifierToPolicyEndpointMap.Store(tt.podIdentifier, policyEndpointsList)
diff --git a/go.mod b/go.mod
index b96c64f..f9f9413 100644
--- a/go.mod
+++ b/go.mod
@@ -1,10 +1,10 @@
 module github.com/aws/aws-network-policy-agent
 
-go 1.21
+go 1.21.11
 
 require (
 	github.com/aws/amazon-vpc-cni-k8s v1.18.1
-	github.com/aws/aws-ebpf-sdk-go v1.0.8
+	github.com/aws/aws-ebpf-sdk-go v1.0.9
 	github.com/aws/aws-sdk-go v1.50.30
 	github.com/go-logr/logr v1.4.1
 	github.com/go-logr/zapr v1.3.0
diff --git a/go.sum b/go.sum
index da3c68f..c1e68ed 100644
--- a/go.sum
+++ b/go.sum
@@ -1,7 +1,11 @@
 github.com/aws/amazon-vpc-cni-k8s v1.18.1 h1:u/OeBgnUUX6f3PCEOpA4dbG0+iZ71CnY6tEljjrl3iw=
 github.com/aws/amazon-vpc-cni-k8s v1.18.1/go.mod h1:m/J5GsxF0Th2iQTOE3ww4W9LFvwdC0tGyA9dIL4h6iQ=
-github.com/aws/aws-ebpf-sdk-go v1.0.8 h1:GyfMwkfS6Z8+5FgqRWlq+Sa3J97Qyb4fVY3KPkkyTW0=
-github.com/aws/aws-ebpf-sdk-go v1.0.8/go.mod h1:RR0L0fJn8cJGgRH6zEYU4N64j6aee5P8gpUUFgkUQMA=
+github.com/aws/aws-ebpf-sdk-go v1.0.9-rc1 h1:vDtkvNEvdF8L+2/qBahIuyLvOTeQs+ToVbkGw4QGJvI=
+github.com/aws/aws-ebpf-sdk-go v1.0.9-rc1/go.mod h1:6lwTHtNgTp/kQzx4pdnp09LJevvIVqYf0ce8pP2u66E=
+github.com/aws/aws-ebpf-sdk-go v1.0.9-rc2 h1:W2mdC1KjMk/fh7jfF/YP6s+Y9FsiEYc33PdJVsfix1g=
+github.com/aws/aws-ebpf-sdk-go v1.0.9-rc2/go.mod h1:SBy1vl1WXMingLbqPZfHd1VXTqB9cD473JwUfoEM+Qs=
+github.com/aws/aws-ebpf-sdk-go v1.0.9 h1:FvkyeRUKNvbUFgzh+Ia7XbBb5U86dHW6dCrljt76Fao=
+github.com/aws/aws-ebpf-sdk-go v1.0.9/go.mod h1:SBy1vl1WXMingLbqPZfHd1VXTqB9cD473JwUfoEM+Qs=
 github.com/aws/aws-sdk-go v1.50.30 h1:2OelKH1eayeaH7OuL1Y9Ombfw4HK+/k0fEnJNWjyLts=
 github.com/aws/aws-sdk-go v1.50.30/go.mod h1:LF8svs817+Nz+DmiMQKTO3ubZ/6IaTpq3TjupRn3Eqk=
 github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
diff --git a/main.go b/main.go
index bfb5017..0bfe158 100644
--- a/main.go
+++ b/main.go
@@ -83,10 +83,16 @@ func main() {
 		os.Exit(1)
 	}
 
+	err = ctrlConfig.ValidControllerFlags()
+	if err != nil{
+		setupLog.Error(err, "Controller flags validation failed")
+		os.Exit(1)
+	}
+
 	ctx := ctrl.SetupSignalHandler()
 	policyEndpointController, err := controllers.NewPolicyEndpointsReconciler(mgr.GetClient(),
 		ctrl.Log.WithName("controllers").WithName("policyEndpoints"), ctrlConfig.EnablePolicyEventLogs, ctrlConfig.EnableCloudWatchLogs,
-		ctrlConfig.EnableIPv6, ctrlConfig.EnableNetworkPolicy, ctrlConfig.ConntrackCacheCleanupPeriod)
+		ctrlConfig.EnableIPv6, ctrlConfig.EnableNetworkPolicy, ctrlConfig.ConntrackCacheCleanupPeriod, ctrlConfig.ConntrackCacheTableSize)
 	if err != nil {
 		setupLog.Error(err, "unable to setup controller", "controller", "PolicyEndpoints init failed")
 		os.Exit(1)
diff --git a/pkg/config/controller_config.go b/pkg/config/controller_config.go
index ad4b8ba..1feb1fe 100644
--- a/pkg/config/controller_config.go
+++ b/pkg/config/controller_config.go
@@ -1,6 +1,9 @@
 package config
 
-import "github.com/spf13/pflag"
+import (
+	"github.com/spf13/pflag"
+	"errors"
+)
 
 const (
 	flagLogLevel = "log-level"
@@ -10,11 +13,13 @@ const (
 	defaultLogFile = "/var/log/aws-routed-eni/network-policy-agent.log"
 	defaultMaxConcurrentReconciles = 3
 	defaultConntrackCacheCleanupPeriod = 300
+	defaultConntrackCacheTableSize = 256 * 1024
 	flagEnablePolicyEventLogs = "enable-policy-event-logs"
 	flagEnableCloudWatchLogs = "enable-cloudwatch-logs"
 	flagEnableIPv6 = "enable-ipv6"
 	flagEnableNetworkPolicy = "enable-network-policy"
 	flagConntrackCacheCleanupPeriod = "conntrack-cache-cleanup-period"
+	flagConntrackCacheTableSize = "conntrack-cache-table-size"
 )
 
 // ControllerConfig contains the controller configuration
@@ -35,6 +40,8 @@ type ControllerConfig struct {
 	EnableNetworkPolicy bool
 	// ConntrackCacheCleanupPeriod specifies the cleanup period
 	ConntrackCacheCleanupPeriod int
+	// ConntrackCacheTableSize specifies the conntrack table size for the agent
+	ConntrackCacheTableSize int
 	// Configurations for the Controller Runtime
 	RuntimeConfig RuntimeConfig
 }
@@ -52,6 +59,17 @@ func (cfg *ControllerConfig) BindFlags(fs *pflag.FlagSet) {
 	fs.BoolVar(&cfg.EnableNetworkPolicy, flagEnableNetworkPolicy, false, "If enabled, Network Policy agent will initialize BPF maps and start reconciler")
 	fs.IntVar(&cfg.ConntrackCacheCleanupPeriod, flagConntrackCacheCleanupPeriod, defaultConntrackCacheCleanupPeriod, ""+
 		"Cleanup interval for network policy agent conntrack cache")
+	fs.IntVar(&cfg.ConntrackCacheTableSize, flagConntrackCacheTableSize, defaultConntrackCacheTableSize, ""+
+		"Table size for network policy agent conntrack cache")
 
 	cfg.RuntimeConfig.BindFlags(fs)
 }
+
+// Validate controller flags
+func (cfg *ControllerConfig) ValidControllerFlags() error {
+	// Validate conntrack cache table size
+	if cfg.ConntrackCacheTableSize < (32*1024) || cfg.ConntrackCacheTableSize > (1024*1024) {
+		return errors.New("Invalid conntrack cache table size, should be between 32K and 1024K")
+	}
+	return nil
+}
diff --git a/pkg/ebpf/bpf_client.go b/pkg/ebpf/bpf_client.go
index 293bd1e..5b63fc9 100644
--- a/pkg/ebpf/bpf_client.go
+++ b/pkg/ebpf/bpf_client.go
@@ -109,7 +109,7 @@ type EbpfFirewallRules struct {
 }
 
 func NewBpfClient(policyEndpointeBPFContext *sync.Map, nodeIP string, enablePolicyEventLogs, enableCloudWatchLogs bool,
-	enableIPv6 bool, conntrackTTL int) (*bpfClient, error) {
+	enableIPv6 bool, conntrackTTL int, conntrackTableSize int) (*bpfClient, error) {
 	var conntrackMap goebpfmaps.BpfMap
 
 	ebpfClient := &bpfClient{
@@ -181,10 +181,19 @@ func NewBpfClient(policyEndpointeBPFContext *sync.Map, nodeIP string, enablePoli
 	if enableIPv6 {
 		eventsProbe = EVENTS_V6_BINARY
 	}
-	_, globalMapInfo, err := ebpfClient.bpfSDKClient.LoadBpfFile(eventsProbe, "global")
+	var bpfSdkInputData goelf.BpfCustomData
+	bpfSdkInputData.FilePath = eventsProbe
+	bpfSdkInputData.CustomPinPath = "global"
+	bpfSdkInputData.CustomMapSize = make(map[string]int)
+
+	bpfSdkInputData.CustomMapSize[AWS_CONNTRACK_MAP] = conntrackTableSize
+
+	ebpfClient.logger.Info("Setting conntrack cache map size: ", "max entries", conntrackTableSize)
+
+	_, globalMapInfo, err := ebpfClient.bpfSDKClient.LoadBpfFileWithCustomData(bpfSdkInputData)
 	if err != nil {
 		ebpfClient.logger.Error(err, "Unable to load events binary. Required for policy enforcement, exiting..")
-		sdkAPIErr.WithLabelValues("LoadBpfFile").Inc()
+		sdkAPIErr.WithLabelValues("LoadBpfFileWithCustomData").Inc()
 		return nil, err
 	}
 	ebpfClient.logger.Info("Successfully loaded events probe")

From 94e8dcda17137366a72af4599c08fef8ddc74d76 Mon Sep 17 00:00:00 2001
From: Jayanth Varavani <1111446+jayanthvn@users.noreply.github.com>
Date: Mon, 24 Jun 2024 19:30:06 +0000
Subject: [PATCH 2/3] formating

---
 main.go                         | 4 ++--
 pkg/config/controller_config.go | 3 ++-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/main.go b/main.go
index 0bfe158..f2666ac 100644
--- a/main.go
+++ b/main.go
@@ -83,8 +83,8 @@ func main() {
 		os.Exit(1)
 	}
 
-	err = ctrlConfig.ValidControllerFlags()
-	if err != nil{
+	err = ctrlConfig.ValidControllerFlags()
+	if err != nil {
 		setupLog.Error(err, "Controller flags validation failed")
 		os.Exit(1)
 	}
diff --git a/pkg/config/controller_config.go b/pkg/config/controller_config.go
index 1feb1fe..e66f70d 100644
--- a/pkg/config/controller_config.go
+++ b/pkg/config/controller_config.go
@@ -1,8 +1,9 @@
 package config
 
 import (
-	"github.com/spf13/pflag"
 	"errors"
+
+	"github.com/spf13/pflag"
 )
 
 const (

From 6292bee9daa39096602bb1290d34c92372451333 Mon Sep 17 00:00:00 2001
From: Jayanth Varavani <1111446+jayanthvn@users.noreply.github.com>
Date: Tue, 25 Jun 2024 20:06:49 +0000
Subject: [PATCH 3/3] readme

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 223cba7..bd56c2f 100644
--- a/README.md
+++ b/README.md
@@ -96,7 +96,7 @@ Type: Integer
 
 Default: 1024 * 256
 
-Network Policy agent maintains a local conntrack cache. Ideally, this should be the same size as the kernel conntrack table. Note that this should be configured on new nodes before enabling network policy; if network policy is already enabled, changing this value requires a reload of the nodes. Dynamically updating the conntrack map size would lead to traffic disruption, hence we won't support it. Supported values are between 32K and 1024K.
+Network Policy agent maintains a local conntrack cache. Ideally, this should be the same size as the kernel conntrack table. Note that this should be configured on new nodes before enabling network policy; if network policy is already enabled, changing this value requires a reload of the nodes. Dynamically updating the conntrack map size would lead to traffic disruption and isn't supported. Supported values are between 32K and 1024K.
 
 
 ## Network Policy Agent CLI
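To make the accepted range concrete, here is a table-driven test sketch for the `ValidControllerFlags` method introduced in PATCH 1/3. The test name and case selection are illustrative, and it assumes placement alongside `pkg/config/controller_config.go` in package `config`.

```go
package config

import "testing"

// Illustrative bounds check for ValidControllerFlags: table sizes below 32K
// or above 1024K entries are rejected, everything in between is accepted.
func TestValidControllerFlagsBounds(t *testing.T) {
	tests := []struct {
		name      string
		tableSize int
		wantErr   bool
	}{
		{"below minimum", 32*1024 - 1, true},
		{"minimum", 32 * 1024, false},
		{"default", 256 * 1024, false},
		{"maximum", 1024 * 1024, false},
		{"above maximum", 1024*1024 + 1, true},
	}
	for _, tt := range tests {
		cfg := &ControllerConfig{ConntrackCacheTableSize: tt.tableSize}
		err := cfg.ValidControllerFlags()
		if (err != nil) != tt.wantErr {
			t.Errorf("%s: ValidControllerFlags() error = %v, wantErr %v", tt.name, err, tt.wantErr)
		}
	}
}
```

The boundary values 32*1024 and 1024*1024 pass because the check only rejects sizes strictly below 32K or strictly above 1024K entries.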