From 01c08d67b62ce7bbea3760ef4b5c72c729d612d6 Mon Sep 17 00:00:00 2001 From: Markus Lehtonen Date: Thu, 17 Aug 2023 16:53:06 +0300 Subject: [PATCH 1/6] Rename nfd-topology-gc to nfd-gc This is preparation for making it a generic garbage collector for all nfd-managed api objects. --- cmd/{nfd-topology-gc => nfd-gc}/main.go | 18 ++++++------ cmd/{nfd-topology-gc => nfd-gc}/main_test.go | 0 .../{topology-gc.yaml => nfd-gc.yaml} | 0 .../nfd-nrt-gc.go => nfd-gc/nfd-gc.go} | 28 +++++++++---------- .../nfd-gc_test.go} | 6 ++-- 5 files changed, 26 insertions(+), 26 deletions(-) rename cmd/{nfd-topology-gc => nfd-gc}/main.go (74%) rename cmd/{nfd-topology-gc => nfd-gc}/main_test.go (100%) rename deployment/helm/node-feature-discovery/templates/{topology-gc.yaml => nfd-gc.yaml} (100%) rename pkg/{nfd-topology-gc/nfd-nrt-gc.go => nfd-gc/nfd-gc.go} (85%) rename pkg/{nfd-topology-gc/nfd-nrt-gc_test.go => nfd-gc/nfd-gc_test.go} (97%) diff --git a/cmd/nfd-topology-gc/main.go b/cmd/nfd-gc/main.go similarity index 74% rename from cmd/nfd-topology-gc/main.go rename to cmd/nfd-gc/main.go index 99626a489a..a47488d164 100644 --- a/cmd/nfd-topology-gc/main.go +++ b/cmd/nfd-gc/main.go @@ -24,13 +24,13 @@ import ( "k8s.io/klog/v2" - nfdtopologygarbagecollector "sigs.k8s.io/node-feature-discovery/pkg/nfd-topology-gc" + nfdgarbagecollector "sigs.k8s.io/node-feature-discovery/pkg/nfd-gc" "sigs.k8s.io/node-feature-discovery/pkg/version" ) const ( // ProgramName is the canonical name of this program - ProgramName = "nfd-topology-gc" + ProgramName = "nfd-gc" ) func main() { @@ -50,10 +50,10 @@ func main() { klog.InfoS("version not set! 
Set -ldflags \"-X sigs.k8s.io/node-feature-discovery/pkg/version.version=`git describe --tags --dirty --always`\" during build or run.") } - // Get new TopologyGC instance - gc, err := nfdtopologygarbagecollector.New(args) + // Get new garbage collector instance + gc, err := nfdgarbagecollector.New(args) if err != nil { - klog.ErrorS(err, "failed to initialize topology garbage collector instance") + klog.ErrorS(err, "failed to initialize nfd garbage collector instance") os.Exit(1) } @@ -63,7 +63,7 @@ func main() { } } -func parseArgs(flags *flag.FlagSet, osArgs ...string) *nfdtopologygarbagecollector.Args { +func parseArgs(flags *flag.FlagSet, osArgs ...string) *nfdgarbagecollector.Args { args := initFlags(flags) _ = flags.Parse(osArgs) @@ -76,11 +76,11 @@ func parseArgs(flags *flag.FlagSet, osArgs ...string) *nfdtopologygarbagecollect return args } -func initFlags(flagset *flag.FlagSet) *nfdtopologygarbagecollector.Args { - args := &nfdtopologygarbagecollector.Args{} +func initFlags(flagset *flag.FlagSet) *nfdgarbagecollector.Args { + args := &nfdgarbagecollector.Args{} flagset.DurationVar(&args.GCPeriod, "gc-interval", time.Duration(1)*time.Hour, - "Interval between which Garbage Collector will try to cleanup any missed but already obsolete NodeResourceTopology. 
[Default: 1h]") + "interval between cleanup of obsolete api objects") flagset.StringVar(&args.Kubeconfig, "kubeconfig", "", "Kubeconfig to use") diff --git a/cmd/nfd-topology-gc/main_test.go b/cmd/nfd-gc/main_test.go similarity index 100% rename from cmd/nfd-topology-gc/main_test.go rename to cmd/nfd-gc/main_test.go diff --git a/deployment/helm/node-feature-discovery/templates/topology-gc.yaml b/deployment/helm/node-feature-discovery/templates/nfd-gc.yaml similarity index 100% rename from deployment/helm/node-feature-discovery/templates/topology-gc.yaml rename to deployment/helm/node-feature-discovery/templates/nfd-gc.yaml diff --git a/pkg/nfd-topology-gc/nfd-nrt-gc.go b/pkg/nfd-gc/nfd-gc.go similarity index 85% rename from pkg/nfd-topology-gc/nfd-nrt-gc.go rename to pkg/nfd-gc/nfd-gc.go index 2ec9de8156..22bb95f2ca 100644 --- a/pkg/nfd-topology-gc/nfd-nrt-gc.go +++ b/pkg/nfd-gc/nfd-gc.go @@ -14,7 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ -package nfdtopologygarbagecollector +package nfdgarbagecollector import ( "context" @@ -42,19 +42,19 @@ type Args struct { Kubeconfig string } -type TopologyGC interface { +type NfdGarbageCollector interface { Run() error Stop() } -type topologyGC struct { +type nfdGarbageCollector struct { stopChan chan struct{} topoClient topologyclientset.Interface gcPeriod time.Duration factory informers.SharedInformerFactory } -func New(args *Args) (TopologyGC, error) { +func New(args *Args) (NfdGarbageCollector, error) { kubeconfig, err := apihelper.GetKubeconfig(args.Kubeconfig) if err != nil { return nil, err @@ -62,10 +62,10 @@ func New(args *Args) (TopologyGC, error) { stop := make(chan struct{}) - return newTopologyGC(kubeconfig, stop, args.GCPeriod) + return newNfdGarbageCollector(kubeconfig, stop, args.GCPeriod) } -func newTopologyGC(config *restclient.Config, stop chan struct{}, gcPeriod time.Duration) (*topologyGC, error) { +func newNfdGarbageCollector(config *restclient.Config, stop chan struct{}, gcPeriod time.Duration) (*nfdGarbageCollector, error) { helper := apihelper.K8sHelpers{Kubeconfig: config} cli, err := helper.GetTopologyClient() if err != nil { @@ -75,7 +75,7 @@ func newTopologyGC(config *restclient.Config, stop chan struct{}, gcPeriod time. clientset := kubernetes.NewForConfigOrDie(config) factory := informers.NewSharedInformerFactory(clientset, 5*time.Minute) - return &topologyGC{ + return &nfdGarbageCollector{ topoClient: cli, stopChan: stop, gcPeriod: gcPeriod, @@ -83,7 +83,7 @@ func newTopologyGC(config *restclient.Config, stop chan struct{}, gcPeriod time. 
}, nil } -func (n *topologyGC) deleteNRT(nodeName string) { +func (n *nfdGarbageCollector) deleteNRT(nodeName string) { if err := n.topoClient.TopologyV1alpha2().NodeResourceTopologies().Delete(context.TODO(), nodeName, metav1.DeleteOptions{}); err != nil { if errors.IsNotFound(err) { klog.V(2).InfoS("NodeResourceTopology not found, omitting deletion", "nodeName", nodeName) @@ -96,7 +96,7 @@ func (n *topologyGC) deleteNRT(nodeName string) { klog.InfoS("NodeResourceTopology object has been deleted", "nodeName", nodeName) } -func (n *topologyGC) deleteNodeHandler(object interface{}) { +func (n *nfdGarbageCollector) deleteNodeHandler(object interface{}) { // handle a case when we are starting up and need to clear stale NRT resources obj := object if deletedFinalStateUnknown, ok := object.(cache.DeletedFinalStateUnknown); ok { @@ -114,7 +114,7 @@ func (n *topologyGC) deleteNodeHandler(object interface{}) { } // garbageCollect removes all stale API objects -func (n *topologyGC) garbageCollect() { +func (n *nfdGarbageCollector) garbageCollect() { klog.InfoS("performing garbage collection") nodes, err := n.factory.Core().V1().Nodes().Lister().List(labels.Everything()) if err != nil { @@ -145,7 +145,7 @@ func (n *topologyGC) garbageCollect() { } // periodicGC runs garbage collector at every gcPeriod to make sure we haven't missed any node -func (n *topologyGC) periodicGC(gcPeriod time.Duration) { +func (n *nfdGarbageCollector) periodicGC(gcPeriod time.Duration) { // Do initial round of garbage collection at startup time n.garbageCollect() @@ -162,7 +162,7 @@ func (n *topologyGC) periodicGC(gcPeriod time.Duration) { } } -func (n *topologyGC) startNodeInformer() error { +func (n *nfdGarbageCollector) startNodeInformer() error { nodeInformer := n.factory.Core().V1().Nodes().Informer() if _, err := nodeInformer.AddEventHandler(cache.ResourceEventHandlerFuncs{ @@ -179,7 +179,7 @@ func (n *topologyGC) startNodeInformer() error { } // Run is a blocking function that removes stale 
NRT objects when Node is deleted and runs periodic GC to make sure any obsolete objects are removed -func (n *topologyGC) Run() error { +func (n *nfdGarbageCollector) Run() error { if err := n.startNodeInformer(); err != nil { return err } @@ -189,6 +189,6 @@ func (n *topologyGC) Run() error { return nil } -func (n *topologyGC) Stop() { +func (n *nfdGarbageCollector) Stop() { close(n.stopChan) } diff --git a/pkg/nfd-topology-gc/nfd-nrt-gc_test.go b/pkg/nfd-gc/nfd-gc_test.go similarity index 97% rename from pkg/nfd-topology-gc/nfd-nrt-gc_test.go rename to pkg/nfd-gc/nfd-gc_test.go index 746be8694f..58af2b6b07 100644 --- a/pkg/nfd-topology-gc/nfd-nrt-gc_test.go +++ b/pkg/nfd-gc/nfd-gc_test.go @@ -14,7 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. */ -package nfdtopologygarbagecollector +package nfdgarbagecollector import ( "context" @@ -93,7 +93,7 @@ func TestNRTGC(t *testing.T) { func newMockGC(nodes, nrts []string) *mockGC { k8sClient := fakek8sclientset.NewSimpleClientset(createFakeNodes(nodes...)...) return &mockGC{ - topologyGC: topologyGC{ + nfdGarbageCollector: nfdGarbageCollector{ factory: informers.NewSharedInformerFactory(k8sClient, 5*time.Minute), topoClient: faketopologyv1alpha2.NewSimpleClientset(createFakeNRTs(nrts...)...), stopChan: make(chan struct{}, 1), @@ -126,7 +126,7 @@ func createFakeNRTs(names ...string) []runtime.Object { } type mockGC struct { - topologyGC + nfdGarbageCollector k8sClient k8sclientset.Interface } From e3415ec484422965025b771ffe34c057ebe76669 Mon Sep 17 00:00:00 2001 From: Markus Lehtonen Date: Thu, 17 Aug 2023 20:01:14 +0300 Subject: [PATCH 2/6] nfd-gc: support garbage collection of NodeFeatures Hook into the same logic already exercised for NodeResourceTopology objects: GC watches for node delete events and immediately drops stale objects (NRT and now also NF). In addition there is a periodic resync to catch any missed node deletes, once every hour by default. 
--- pkg/nfd-gc/nfd-gc.go | 66 ++++++++++++++++++++++++++++++++------- pkg/nfd-gc/nfd-gc_test.go | 2 ++ 2 files changed, 56 insertions(+), 12 deletions(-) diff --git a/pkg/nfd-gc/nfd-gc.go b/pkg/nfd-gc/nfd-gc.go index 22bb95f2ca..b98c7f9381 100644 --- a/pkg/nfd-gc/nfd-gc.go +++ b/pkg/nfd-gc/nfd-gc.go @@ -33,6 +33,8 @@ import ( "k8s.io/klog/v2" "sigs.k8s.io/node-feature-discovery/pkg/apihelper" + nfdv1alpha1 "sigs.k8s.io/node-feature-discovery/pkg/apis/nfd/v1alpha1" + nfdclientset "sigs.k8s.io/node-feature-discovery/pkg/generated/clientset/versioned" ) // Args are the command line arguments @@ -49,6 +51,7 @@ type NfdGarbageCollector interface { type nfdGarbageCollector struct { stopChan chan struct{} + nfdClient nfdclientset.Interface topoClient topologyclientset.Interface gcPeriod time.Duration factory informers.SharedInformerFactory @@ -77,12 +80,26 @@ func newNfdGarbageCollector(config *restclient.Config, stop chan struct{}, gcPer return &nfdGarbageCollector{ topoClient: cli, + nfdClient: nfdclientset.NewForConfigOrDie(config), stopChan: stop, gcPeriod: gcPeriod, factory: factory, }, nil } +func (n *nfdGarbageCollector) deleteNodeFeature(namespace, name string) { + if err := n.nfdClient.NfdV1alpha1().NodeFeatures(namespace).Delete(context.TODO(), name, metav1.DeleteOptions{}); err != nil { + if errors.IsNotFound(err) { + klog.V(2).InfoS("NodeFeature not found, omitting deletion", "nodefeature", klog.KRef(namespace, name)) + return + } else { + klog.ErrorS(err, "failed to delete NodeFeature object", "nodefeature", klog.KRef(namespace, name)) + return + } + } + klog.InfoS("NodeFeature object has been deleted", "nodefeature", klog.KRef(namespace, name)) +} + func (n *nfdGarbageCollector) deleteNRT(nodeName string) { if err := n.topoClient.TopologyV1alpha2().NodeResourceTopologies().Delete(context.TODO(), nodeName, metav1.DeleteOptions{}); err != nil { if errors.IsNotFound(err) { @@ -111,6 +128,16 @@ func (n *nfdGarbageCollector) deleteNodeHandler(object interface{}) 
{ } n.deleteNRT(node.GetName()) + + // Delete all NodeFeature objects (from all namespaces) targeting the deleted node + nfListOptions := metav1.ListOptions{LabelSelector: nfdv1alpha1.NodeFeatureObjNodeNameLabel + "=" + node.GetName()} + if nfs, err := n.nfdClient.NfdV1alpha1().NodeFeatures("").List(context.TODO(), nfListOptions); err != nil { + klog.ErrorS(err, "failed to list NodeFeature objects") + } else { + for _, nf := range nfs.Items { + n.deleteNodeFeature(nf.Namespace, nf.Name) + } + } } // garbageCollect removes all stale API objects @@ -126,20 +153,35 @@ func (n *nfdGarbageCollector) garbageCollect() { nodeNames.Insert(node.Name) } - nrts, err := n.topoClient.TopologyV1alpha2().NodeResourceTopologies().List(context.TODO(), metav1.ListOptions{}) - if err != nil { - klog.ErrorS(err, "failed to list NodeResourceTopology objects") - return + // Handle NodeFeature objects + nfs, err := n.nfdClient.NfdV1alpha1().NodeFeatures("").List(context.TODO(), metav1.ListOptions{}) + if errors.IsNotFound(err) { + klog.V(2).InfoS("NodeFeature CRD does not exist") + } else if err != nil { + klog.ErrorS(err, "failed to list NodeFeature objects") + } else { + for _, nf := range nfs.Items { + nodeName, ok := nf.GetLabels()[nfdv1alpha1.NodeFeatureObjNodeNameLabel] + if !ok { + klog.InfoS("node name label missing from NodeFeature object", "nodefeature", klog.KObj(&nf)) + } + if !nodeNames.Has(nodeName) { + n.deleteNodeFeature(nf.Namespace, nf.Name) + } + } } - for _, nrt := range nrts.Items { - key, err := cache.MetaNamespaceKeyFunc(&nrt) - if err != nil { - klog.ErrorS(err, "failed to create key", "noderesourcetopology", klog.KObj(&nrt)) - continue - } - if !nodeNames.Has(key) { - n.deleteNRT(key) + // Handle NodeResourceTopology objects + nrts, err := n.topoClient.TopologyV1alpha2().NodeResourceTopologies().List(context.TODO(), metav1.ListOptions{}) + if errors.IsNotFound(err) { + klog.V(2).InfoS("NodeResourceTopology CRD does not exist") + } else if err != nil { + 
klog.ErrorS(err, "failed to list NodeResourceTopology objects") + } else { + for _, nrt := range nrts.Items { + if !nodeNames.Has(nrt.Name) { + n.deleteNRT(nrt.Name) + } } } } diff --git a/pkg/nfd-gc/nfd-gc_test.go b/pkg/nfd-gc/nfd-gc_test.go index 58af2b6b07..a1c9310d6e 100644 --- a/pkg/nfd-gc/nfd-gc_test.go +++ b/pkg/nfd-gc/nfd-gc_test.go @@ -31,6 +31,7 @@ import ( "k8s.io/client-go/informers" k8sclientset "k8s.io/client-go/kubernetes" fakek8sclientset "k8s.io/client-go/kubernetes/fake" + fakenfdclientset "sigs.k8s.io/node-feature-discovery/pkg/generated/clientset/versioned/fake" . "github.com/smartystreets/goconvey/convey" ) @@ -95,6 +96,7 @@ func newMockGC(nodes, nrts []string) *mockGC { return &mockGC{ nfdGarbageCollector: nfdGarbageCollector{ factory: informers.NewSharedInformerFactory(k8sClient, 5*time.Minute), + nfdClient: fakenfdclientset.NewSimpleClientset(), topoClient: faketopologyv1alpha2.NewSimpleClientset(createFakeNRTs(nrts...)...), stopChan: make(chan struct{}, 1), gcPeriod: 10 * time.Minute, From f9fadd2102d8d214953d26c9bbaa048485c1d397 Mon Sep 17 00:00:00 2001 From: Markus Lehtonen Date: Thu, 17 Aug 2023 21:44:50 +0300 Subject: [PATCH 3/6] test/e2e: add e2e test for nfd-gc --- test/e2e/nfd_gc_test.go | 199 ++++++++++++++++++++++++ test/e2e/utils/crd.go | 12 ++ test/e2e/utils/deployment/deployment.go | 50 ++++++ test/e2e/utils/noderesourcetopology.go | 10 ++ test/e2e/utils/pod/pod.go | 32 ++++ test/e2e/utils/rbac.go | 77 +++++++++ 6 files changed, 380 insertions(+) create mode 100644 test/e2e/nfd_gc_test.go create mode 100644 test/e2e/utils/deployment/deployment.go diff --git a/test/e2e/nfd_gc_test.go b/test/e2e/nfd_gc_test.go new file mode 100644 index 0000000000..babdbbed9c --- /dev/null +++ b/test/e2e/nfd_gc_test.go @@ -0,0 +1,199 @@ +/* +Copyright 2023 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package e2e + +import ( + "context" + "time" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + + topologyv1alpha2 "github.com/k8stopologyawareschedwg/noderesourcetopology-api/pkg/apis/topology/v1alpha2" + topologyclient "github.com/k8stopologyawareschedwg/noderesourcetopology-api/pkg/generated/clientset/versioned" + apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1" + extclient "k8s.io/apiextensions-apiserver/pkg/client/clientset/clientset" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/kubernetes/test/e2e/framework" + "sigs.k8s.io/node-feature-discovery/pkg/apis/nfd/v1alpha1" + nfdclient "sigs.k8s.io/node-feature-discovery/pkg/generated/clientset/versioned" + "sigs.k8s.io/node-feature-discovery/test/e2e/utils" + testutils "sigs.k8s.io/node-feature-discovery/test/e2e/utils" + testdeploy "sigs.k8s.io/node-feature-discovery/test/e2e/utils/deployment" + testpod "sigs.k8s.io/node-feature-discovery/test/e2e/utils/pod" +) + +// Actual test suite +var _ = SIGDescribe("NFD GC", func() { + f := framework.NewDefaultFramework("nfd-gc") + + Context("when deploying nfd-gc", Ordered, func() { + var ( + crds []*apiextensionsv1.CustomResourceDefinition + extClient *extclient.Clientset + nfdClient *nfdclient.Clientset + topologyClient *topologyclient.Clientset + ) + + BeforeAll(func(ctx context.Context) { + // Create clients for apiextensions and our CRD api + extClient = extclient.NewForConfigOrDie(f.ClientConfig()) + nfdClient = nfdclient.NewForConfigOrDie(f.ClientConfig()) + topologyClient = 
topologyclient.NewForConfigOrDie(f.ClientConfig()) + + By("Creating CRDs") + var err error + crds, err = testutils.CreateNfdCRDs(ctx, extClient) + Expect(err).NotTo(HaveOccurred()) + crd, err := testutils.CreateNodeResourceTopologies(ctx, extClient) + Expect(err).NotTo(HaveOccurred()) + crds = append(crds, crd) + }) + + AfterAll(func(ctx context.Context) { + for _, crd := range crds { + err := extClient.ApiextensionsV1().CustomResourceDefinitions().Delete(ctx, crd.Name, metav1.DeleteOptions{}) + Expect(err).NotTo(HaveOccurred()) + } + }) + + JustBeforeEach(func(ctx context.Context) { + err := testutils.ConfigureRBAC(ctx, f.ClientSet, f.Namespace.Name) + Expect(err).NotTo(HaveOccurred()) + cleanupCRs(ctx, nfdClient, f.Namespace.Name) + cleanupNRTs(ctx, topologyClient) + }) + + AfterEach(func(ctx context.Context) { + Expect(testutils.DeconfigureRBAC(ctx, f.ClientSet, f.Namespace.Name)).NotTo(HaveOccurred()) + cleanupCRs(ctx, nfdClient, f.Namespace.Name) + cleanupNRTs(ctx, topologyClient) + }) + + // Helper functions + createCRs := func(ctx context.Context, nodeNames []string) error { + for _, name := range nodeNames { + if err := utils.CreateNodeFeature(ctx, nfdClient, f.Namespace.Name, name, name); err != nil { + return err + } + if err := utils.CreateNodeResourceTopology(ctx, topologyClient, name); err != nil { + return err + } + framework.Logf("CREATED CRS FOR node %q", name) + } + return nil + } + + getNodeFeatures := func(ctx context.Context) ([]v1alpha1.NodeFeature, error) { + nfl, err := nfdClient.NfdV1alpha1().NodeFeatures("").List(ctx, metav1.ListOptions{}) + if err != nil { + return nil, err + } + return nfl.Items, nil + } + + getNodeResourceTopologies := func(ctx context.Context) ([]topologyv1alpha2.NodeResourceTopology, error) { + nrtl, err := topologyClient.TopologyV1alpha2().NodeResourceTopologies().List(ctx, metav1.ListOptions{}) + if err != nil { + return nil, err + } + return nrtl.Items, nil + } + + // + // Test GC at startup + // + Context("with 
pre-existing NodeFeature and NodeResourceTopology objects", func() { + It("it should delete stale objects at startup", func(ctx context.Context) { + nodes, err := f.ClientSet.CoreV1().Nodes().List(ctx, metav1.ListOptions{}) + Expect(err).NotTo(HaveOccurred()) + targetNodeNames := []string{nodes.Items[0].GetName(), nodes.Items[len(nodes.Items)-1].GetName()} + staleNodeNames := []string{"non-existent-node-1", "non-existent-node-2"} + + // Create NodeFeature and NodeResourceTopology objects + By("Creating CRs") + Expect(createCRs(ctx, targetNodeNames)).NotTo(HaveOccurred()) + Expect(createCRs(ctx, staleNodeNames)).NotTo(HaveOccurred()) + + // Deploy nfd-gc + By("Creating nfd-gc deployment") + podSpecOpts := []testpod.SpecOption{testpod.SpecWithContainerImage(dockerImage())} + gcDeploy := testdeploy.NFDGC(podSpecOpts...) + gcDeploy, err = f.ClientSet.AppsV1().Deployments(f.Namespace.Name).Create(ctx, gcDeploy, metav1.CreateOptions{}) + Expect(err).NotTo(HaveOccurred()) + + By("Waiting for gc deployment pods to be ready") + Expect(testpod.WaitForReady(ctx, f.ClientSet, f.Namespace.Name, gcDeploy.Spec.Template.Labels["name"], 2)).NotTo(HaveOccurred()) + + // Check that only expected objects exist + By("Verifying CRs") + Eventually(getNodeFeatures).WithPolling(1 * time.Second).WithTimeout(3 * time.Second).WithContext(ctx).Should(ConsistOf(haveNames(targetNodeNames...)...)) + Eventually(getNodeResourceTopologies).WithPolling(1 * time.Second).WithTimeout(3 * time.Second).WithContext(ctx).Should(ConsistOf(haveNames(targetNodeNames...)...)) + }) + }) + + // + // Test periodic GC + // + Context("with stale NodeFeature and NodeResourceTopology objects appearing", func() { + It("it should remove stale objects", func(ctx context.Context) { + nodes, err := f.ClientSet.CoreV1().Nodes().List(ctx, metav1.ListOptions{}) + Expect(err).NotTo(HaveOccurred()) + targetNodeNames := []string{nodes.Items[0].GetName(), nodes.Items[len(nodes.Items)-1].GetName()} + staleNodeNames := 
[]string{"non-existent-node-2.1", "non-existent-node-2.2"} + + // Deploy nfd-gc + By("Creating nfd-gc deployment") + podSpecOpts := []testpod.SpecOption{ + testpod.SpecWithContainerImage(dockerImage()), + testpod.SpecWithContainerExtraArgs("-gc-interval", "1s"), + } + gcDeploy := testdeploy.NFDGC(podSpecOpts...) + gcDeploy, err = f.ClientSet.AppsV1().Deployments(f.Namespace.Name).Create(ctx, gcDeploy, metav1.CreateOptions{}) + Expect(err).NotTo(HaveOccurred()) + + By("Waiting for gc deployment pods to be ready") + Expect(testpod.WaitForReady(ctx, f.ClientSet, f.Namespace.Name, gcDeploy.Spec.Template.Labels["name"], 2)).NotTo(HaveOccurred()) + + // Create NodeFeature and NodeResourceTopology objects + By("Creating CRs") + Expect(createCRs(ctx, targetNodeNames)).NotTo(HaveOccurred()) + Expect(createCRs(ctx, staleNodeNames)).NotTo(HaveOccurred()) + + // Check that only expected objects exist + By("Verifying CRs") + Eventually(getNodeFeatures).WithPolling(1 * time.Second).WithTimeout(3 * time.Second).WithContext(ctx).Should(ConsistOf(haveNames(targetNodeNames...)...)) + Eventually(getNodeResourceTopologies).WithPolling(1 * time.Second).WithTimeout(3 * time.Second).WithContext(ctx).Should(ConsistOf(haveNames(targetNodeNames...)...)) + }) + }) + }) +}) + +// haveNames is a helper that returns a slice of Gomega matchers for asserting the names of k8s API objects +func haveNames(names ...string) []interface{} { + m := make([]interface{}, len(names)) + for i, n := range names { + m[i] = HaveField("Name", n) + } + return m +} + +func cleanupNRTs(ctx context.Context, cli *topologyclient.Clientset) { + By("Deleting NodeResourceTopology objects from the cluster") + err := cli.TopologyV1alpha2().NodeResourceTopologies().DeleteCollection(ctx, metav1.DeleteOptions{}, metav1.ListOptions{}) + Expect(err).NotTo(HaveOccurred()) +} diff --git a/test/e2e/utils/crd.go b/test/e2e/utils/crd.go index 34c1161aa8..0f36f54abe 100644 --- a/test/e2e/utils/crd.go +++ b/test/e2e/utils/crd.go @@ 
-144,6 +144,18 @@ func UpdateNodeFeatureRulesFromFile(ctx context.Context, cli nfdclientset.Interf return nil } +// CreateNodeFeature creates a dummy NodeFeature object for a node +func CreateNodeFeature(ctx context.Context, cli nfdclientset.Interface, namespace, name, nodeName string) error { + nr := &nfdv1alpha1.NodeFeature{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Labels: map[string]string{nfdv1alpha1.NodeFeatureObjNodeNameLabel: nodeName}, + }, + } + _, err := cli.NfdV1alpha1().NodeFeatures(namespace).Create(ctx, nr, metav1.CreateOptions{}) + return err +} + func apiObjsFromFile(path string, decoder apiruntime.Decoder) ([]apiruntime.Object, error) { data, err := os.ReadFile(path) if err != nil { diff --git a/test/e2e/utils/deployment/deployment.go b/test/e2e/utils/deployment/deployment.go new file mode 100644 index 0000000000..d090251060 --- /dev/null +++ b/test/e2e/utils/deployment/deployment.go @@ -0,0 +1,50 @@ +/* +Copyright 2023 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package daemonset + +import ( + appsv1 "k8s.io/api/apps/v1" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + "sigs.k8s.io/node-feature-discovery/test/e2e/utils/pod" +) + +// NFDGC returns a deployment for NFD Garbage Collector +func NFDGC(opts ...pod.SpecOption) *appsv1.Deployment { + return new("nfd-gc", pod.NFDGCSpec(opts...)) +} + +func new(name string, podSpec *corev1.PodSpec) *appsv1.Deployment { + return &appsv1.Deployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + }, + Spec: appsv1.DeploymentSpec{ + Selector: &metav1.LabelSelector{ + MatchLabels: map[string]string{"name": name}, + }, + Template: corev1.PodTemplateSpec{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{"name": name}, + }, + Spec: *podSpec, + }, + MinReadySeconds: 1, + }, + } +} diff --git a/test/e2e/utils/noderesourcetopology.go b/test/e2e/utils/noderesourcetopology.go index a0521d27fa..b6d7ebae03 100644 --- a/test/e2e/utils/noderesourcetopology.go +++ b/test/e2e/utils/noderesourcetopology.go @@ -107,6 +107,16 @@ func CreateNodeResourceTopologies(ctx context.Context, extClient extclient.Inter return extClient.ApiextensionsV1().CustomResourceDefinitions().Create(ctx, crd, metav1.CreateOptions{}) } +// CreateNodeResourceTopology creates a dummy NodeResourceTopology object for a node +func CreateNodeResourceTopology(ctx context.Context, topologyClient *topologyclientset.Clientset, nodeName string) error { + nrt := &v1alpha2.NodeResourceTopology{ + ObjectMeta: metav1.ObjectMeta{Name: nodeName}, + Zones: v1alpha2.ZoneList{}, + } + _, err := topologyClient.TopologyV1alpha2().NodeResourceTopologies().Create(ctx, nrt, metav1.CreateOptions{}) + return err +} + // GetNodeTopology returns the NodeResourceTopology data for the node identified by `nodeName`.
func GetNodeTopology(ctx context.Context, topologyClient *topologyclientset.Clientset, nodeName string) *v1alpha2.NodeResourceTopology { var nodeTopology *v1alpha2.NodeResourceTopology diff --git a/test/e2e/utils/pod/pod.go b/test/e2e/utils/pod/pod.go index c22f12b657..93b851095a 100644 --- a/test/e2e/utils/pod/pod.go +++ b/test/e2e/utils/pod/pod.go @@ -355,6 +355,38 @@ func nfdWorkerSpec(opts ...SpecOption) *corev1.PodSpec { return p } +func NFDGCSpec(opts ...SpecOption) *corev1.PodSpec { + yes := true + no := false + p := &corev1.PodSpec{ + Containers: []corev1.Container{ + { + Name: "node-feature-discovery", + ImagePullPolicy: pullPolicy(), + Command: []string{"nfd-gc"}, + SecurityContext: &corev1.SecurityContext{ + Capabilities: &corev1.Capabilities{ + Drop: []corev1.Capability{"ALL"}, + }, + Privileged: &no, + RunAsNonRoot: &yes, + ReadOnlyRootFilesystem: &yes, + AllowPrivilegeEscalation: &no, + SeccompProfile: &corev1.SeccompProfile{ + Type: corev1.SeccompProfileTypeRuntimeDefault, + }, + }, + }, + }, + ServiceAccountName: "nfd-gc-e2e", + } + + for _, o := range opts { + o(p) + } + return p +} + func NFDTopologyUpdaterSpec(kc utils.KubeletConfig, opts ...SpecOption) *corev1.PodSpec { p := &corev1.PodSpec{ Containers: []corev1.Container{ diff --git a/test/e2e/utils/rbac.go b/test/e2e/utils/rbac.go index 74488f800d..8549256aa3 100644 --- a/test/e2e/utils/rbac.go +++ b/test/e2e/utils/rbac.go @@ -42,6 +42,11 @@ func ConfigureRBAC(ctx context.Context, cs clientset.Interface, ns string) error return err } + _, err = createServiceAccount(ctx, cs, "nfd-gc-e2e", ns) + if err != nil { + return err + } + _, err = createServiceAccount(ctx, cs, "nfd-topology-updater-e2e", ns) if err != nil { return err @@ -57,6 +62,11 @@ func ConfigureRBAC(ctx context.Context, cs clientset.Interface, ns string) error return err } + _, err = createClusterRoleGC(ctx, cs) + if err != nil { + return err + } + _, err = createClusterRoleTopologyUpdater(ctx, cs) if err != nil { return err @@ 
-72,6 +82,11 @@ func ConfigureRBAC(ctx context.Context, cs clientset.Interface, ns string) error return err } + _, err = createClusterRoleBindingGC(ctx, cs, ns) + if err != nil { + return err + } + _, err = createClusterRoleBindingTopologyUpdater(ctx, cs, ns) if err != nil { return err @@ -94,6 +109,10 @@ func DeconfigureRBAC(ctx context.Context, cs clientset.Interface, ns string) err if err != nil { return err } + err = cs.RbacV1().ClusterRoleBindings().Delete(ctx, "nfd-gc-e2e", metav1.DeleteOptions{}) + if err != nil { + return err + } err = cs.RbacV1().ClusterRoles().Delete(ctx, "nfd-topology-updater-e2e", metav1.DeleteOptions{}) if err != nil { return err @@ -106,6 +125,10 @@ func DeconfigureRBAC(ctx context.Context, cs clientset.Interface, ns string) err if err != nil { return err } + err = cs.RbacV1().ClusterRoles().Delete(ctx, "nfd-gc-e2e", metav1.DeleteOptions{}) + if err != nil { + return err + } err = cs.CoreV1().ServiceAccounts(ns).Delete(ctx, "nfd-topology-updater-e2e", metav1.DeleteOptions{}) if err != nil { return err @@ -118,6 +141,10 @@ func DeconfigureRBAC(ctx context.Context, cs clientset.Interface, ns string) err if err != nil { return err } + err = cs.CoreV1().ServiceAccounts(ns).Delete(ctx, "nfd-gc-e2e", metav1.DeleteOptions{}) + if err != nil { + return err + } return nil } @@ -182,6 +209,33 @@ func createRoleWorker(ctx context.Context, cs clientset.Interface, ns string) (* return cs.RbacV1().Roles(ns).Update(ctx, cr, metav1.UpdateOptions{}) } +// Configure cluster role required by NFD GC +func createClusterRoleGC(ctx context.Context, cs clientset.Interface) (*rbacv1.ClusterRole, error) { + cr := &rbacv1.ClusterRole{ + ObjectMeta: metav1.ObjectMeta{ + Name: "nfd-gc-e2e", + }, + Rules: []rbacv1.PolicyRule{ + { + APIGroups: []string{""}, + Resources: []string{"nodes"}, + Verbs: []string{"list", "watch"}, + }, + { + APIGroups: []string{"nfd.k8s-sigs.io"}, + Resources: []string{"nodefeatures"}, + Verbs: []string{"list", "delete"}, + }, + { + 
APIGroups: []string{"topology.node.k8s.io"}, + Resources: []string{"noderesourcetopologies"}, + Verbs: []string{"list", "delete"}, + }, + }, + } + return cs.RbacV1().ClusterRoles().Update(ctx, cr, metav1.UpdateOptions{}) +} + // Configure cluster role required by NFD Topology Updater func createClusterRoleTopologyUpdater(ctx context.Context, cs clientset.Interface) (*rbacv1.ClusterRole, error) { cr := &rbacv1.ClusterRole{ @@ -268,6 +322,29 @@ func createRoleBindingWorker(ctx context.Context, cs clientset.Interface, ns str return cs.RbacV1().RoleBindings(ns).Update(ctx, crb, metav1.UpdateOptions{}) } +// Configure cluster role binding required by NFD GC +func createClusterRoleBindingGC(ctx context.Context, cs clientset.Interface, ns string) (*rbacv1.ClusterRoleBinding, error) { + crb := &rbacv1.ClusterRoleBinding{ + ObjectMeta: metav1.ObjectMeta{ + Name: "nfd-gc-e2e", + }, + Subjects: []rbacv1.Subject{ + { + Kind: rbacv1.ServiceAccountKind, + Name: "nfd-gc-e2e", + Namespace: ns, + }, + }, + RoleRef: rbacv1.RoleRef{ + APIGroup: rbacv1.GroupName, + Kind: "ClusterRole", + Name: "nfd-gc-e2e", + }, + } + + return cs.RbacV1().ClusterRoleBindings().Update(ctx, crb, metav1.UpdateOptions{}) +} + // Configure cluster role binding required by NFD Topology Updater func createClusterRoleBindingTopologyUpdater(ctx context.Context, cs clientset.Interface, ns string) (*rbacv1.ClusterRoleBinding, error) { crb := &rbacv1.ClusterRoleBinding{ From 6cf29bd8eff5185ebfc8521c4b796782e92a5b3e Mon Sep 17 00:00:00 2001 From: Markus Lehtonen Date: Fri, 18 Aug 2023 12:49:20 +0300 Subject: [PATCH 4/6] deployment/kustomize: support nfd-gc Rename the old "topology-gc" to just "gc". Simplify the setup a bit by including the RBAC rules in the "gc" base. Note: we don't enable nfd-gc in the default overlay, yet, as the NodeFeature API isn't enabled (gc is not needed). 
--- .../gc-clusterrole.yaml} | 9 ++++++++- .../gc-clusterrolebinding.yaml} | 6 +++--- .../gc-serviceaccount.yaml} | 2 +- .../{topology-gc/topology-gc.yaml => gc/gc.yaml} | 12 ++++++------ .../base/{topology-gc => gc}/kustomization.yaml | 5 ++++- deployment/base/rbac-topology-gc/kustomization.yaml | 9 --------- .../master-worker-topologyupdater/kustomization.yaml | 3 +-- .../overlays/topologyupdater/kustomization.yaml | 3 +-- 8 files changed, 24 insertions(+), 25 deletions(-) rename deployment/base/{rbac-topology-gc/topology-gc-clusterrole.yaml => gc/gc-clusterrole.yaml} (74%) rename deployment/base/{rbac-topology-gc/topology-gc-clusterrolebinding.yaml => gc/gc-clusterrolebinding.yaml} (73%) rename deployment/base/{rbac-topology-gc/topology-gc-serviceaccount.yaml => gc/gc-serviceaccount.yaml} (65%) rename deployment/base/{topology-gc/topology-gc.yaml => gc/gc.yaml} (64%) rename deployment/base/{topology-gc => gc}/kustomization.yaml (56%) delete mode 100644 deployment/base/rbac-topology-gc/kustomization.yaml diff --git a/deployment/base/rbac-topology-gc/topology-gc-clusterrole.yaml b/deployment/base/gc/gc-clusterrole.yaml similarity index 74% rename from deployment/base/rbac-topology-gc/topology-gc-clusterrole.yaml rename to deployment/base/gc/gc-clusterrole.yaml index c0f4314447..d4e776c2f2 100644 --- a/deployment/base/rbac-topology-gc/topology-gc-clusterrole.yaml +++ b/deployment/base/gc/gc-clusterrole.yaml @@ -1,7 +1,7 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: - name: nfd-topology-gc + name: nfd-gc rules: - apiGroups: - "" @@ -23,3 +23,10 @@ rules: verbs: - delete - list +- apiGroups: + - nfd.k8s-sigs.io + resources: + - nodefeatures + verbs: + - delete + - list diff --git a/deployment/base/rbac-topology-gc/topology-gc-clusterrolebinding.yaml b/deployment/base/gc/gc-clusterrolebinding.yaml similarity index 73% rename from deployment/base/rbac-topology-gc/topology-gc-clusterrolebinding.yaml rename to 
deployment/base/gc/gc-clusterrolebinding.yaml index b8615d63c0..a0f40b6ed5 100644 --- a/deployment/base/rbac-topology-gc/topology-gc-clusterrolebinding.yaml +++ b/deployment/base/gc/gc-clusterrolebinding.yaml @@ -1,12 +1,12 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding metadata: - name: nfd-topology-gc + name: nfd-gc roleRef: apiGroup: rbac.authorization.k8s.io kind: ClusterRole - name: nfd-topology-gc + name: nfd-gc subjects: - kind: ServiceAccount - name: nfd-topology-gc + name: nfd-gc namespace: default diff --git a/deployment/base/rbac-topology-gc/topology-gc-serviceaccount.yaml b/deployment/base/gc/gc-serviceaccount.yaml similarity index 65% rename from deployment/base/rbac-topology-gc/topology-gc-serviceaccount.yaml rename to deployment/base/gc/gc-serviceaccount.yaml index e56f7bbefd..ec9501a896 100644 --- a/deployment/base/rbac-topology-gc/topology-gc-serviceaccount.yaml +++ b/deployment/base/gc/gc-serviceaccount.yaml @@ -1,4 +1,4 @@ apiVersion: v1 kind: ServiceAccount metadata: - name: nfd-topology-gc + name: nfd-gc diff --git a/deployment/base/topology-gc/topology-gc.yaml b/deployment/base/gc/gc.yaml similarity index 64% rename from deployment/base/topology-gc/topology-gc.yaml rename to deployment/base/gc/gc.yaml index 07565e3a08..bbae4aa5c4 100644 --- a/deployment/base/topology-gc/topology-gc.yaml +++ b/deployment/base/gc/gc.yaml @@ -3,21 +3,21 @@ kind: Deployment metadata: labels: app: nfd - name: nfd-topology-gc + name: nfd-gc spec: selector: matchLabels: - app: nfd-topology-gc + app: nfd-gc template: metadata: labels: - app: nfd-topology-gc + app: nfd-gc spec: dnsPolicy: ClusterFirstWithHostNet - serviceAccount: nfd-topology-gc + serviceAccount: nfd-gc containers: - - name: nfd-topology-gc + - name: nfd-gc image: gcr.io/k8s-staging-nfd/node-feature-discovery:master imagePullPolicy: Always command: - - "nfd-topology-gc" + - "nfd-gc" diff --git a/deployment/base/topology-gc/kustomization.yaml b/deployment/base/gc/kustomization.yaml 
similarity index 56% rename from deployment/base/topology-gc/kustomization.yaml rename to deployment/base/gc/kustomization.yaml index 3d8da69b69..855c0a1a4b 100644 --- a/deployment/base/topology-gc/kustomization.yaml +++ b/deployment/base/gc/kustomization.yaml @@ -4,4 +4,7 @@ kind: Kustomization namespace: node-feature-discovery resources: -- topology-gc.yaml +- gc-clusterrole.yaml +- gc-clusterrolebinding.yaml +- gc-serviceaccount.yaml +- gc.yaml diff --git a/deployment/base/rbac-topology-gc/kustomization.yaml b/deployment/base/rbac-topology-gc/kustomization.yaml deleted file mode 100644 index d0105ebc05..0000000000 --- a/deployment/base/rbac-topology-gc/kustomization.yaml +++ /dev/null @@ -1,9 +0,0 @@ -apiVersion: kustomize.config.k8s.io/v1beta1 -kind: Kustomization - -namespace: node-feature-discovery - -resources: -- topology-gc-clusterrole.yaml -- topology-gc-clusterrolebinding.yaml -- topology-gc-serviceaccount.yaml diff --git a/deployment/overlays/master-worker-topologyupdater/kustomization.yaml b/deployment/overlays/master-worker-topologyupdater/kustomization.yaml index 19fa1bfcc8..15f72bb5c9 100644 --- a/deployment/overlays/master-worker-topologyupdater/kustomization.yaml +++ b/deployment/overlays/master-worker-topologyupdater/kustomization.yaml @@ -6,13 +6,12 @@ namespace: node-feature-discovery resources: - ../../base/rbac - ../../base/rbac-topologyupdater -- ../../base/rbac-topology-gc - ../../base/nfd-crds - ../../base/master - ../../base/worker-daemonset - ../../base/noderesourcetopologies-crd - ../../base/topologyupdater-daemonset -- ../../base/topology-gc +- ../../base/gc - namespace.yaml components: diff --git a/deployment/overlays/topologyupdater/kustomization.yaml b/deployment/overlays/topologyupdater/kustomization.yaml index e001d62ad6..f78d505ca2 100644 --- a/deployment/overlays/topologyupdater/kustomization.yaml +++ b/deployment/overlays/topologyupdater/kustomization.yaml @@ -5,10 +5,9 @@ namespace: node-feature-discovery resources: - 
../../base/rbac-topologyupdater -- ../../base/rbac-topology-gc - ../../base/noderesourcetopologies-crd - ../../base/topologyupdater-daemonset -- ../../base/topology-gc +- ../../base/gc - namespace.yaml components: From ceb672bde01345e3519c7fd8ea22cfe95245e143 Mon Sep 17 00:00:00 2001 From: Markus Lehtonen Date: Fri, 18 Aug 2023 12:52:41 +0300 Subject: [PATCH 5/6] deployment/helm: support nfd-gc Rename files and parameters. Drop the container security context parameters from the Helm chart. There should be no reason to run the nfd-gc with other than the minimal privileges. Also updates the documentation. --- .../templates/_helpers.tpl | 10 ++--- .../templates/clusterrole.yaml | 13 ++++-- .../templates/clusterrolebinding.yaml | 10 ++--- .../templates/nfd-gc.yaml | 40 ++++++++++--------- .../templates/serviceaccount.yaml | 10 ++--- .../helm/node-feature-discovery/values.yaml | 8 +--- docs/deployment/helm.md | 31 +++++++------- 7 files changed, 63 insertions(+), 59 deletions(-) diff --git a/deployment/helm/node-feature-discovery/templates/_helpers.tpl b/deployment/helm/node-feature-discovery/templates/_helpers.tpl index 5a0a5c97f7..928ece78f8 100644 --- a/deployment/helm/node-feature-discovery/templates/_helpers.tpl +++ b/deployment/helm/node-feature-discovery/templates/_helpers.tpl @@ -96,12 +96,12 @@ Create the name of the service account which topologyUpdater will use {{- end -}} {{/* -Create the name of the service account which topologyGC will use +Create the name of the service account which nfd-gc will use */}} -{{- define "node-feature-discovery.topologyGC.serviceAccountName" -}} -{{- if .Values.topologyGC.serviceAccount.create -}} - {{ default (printf "%s-topology-gc" (include "node-feature-discovery.fullname" .)) .Values.topologyGC.serviceAccount.name }} +{{- define "node-feature-discovery.gc.serviceAccountName" -}} +{{- if .Values.gc.serviceAccount.create -}} + {{ default (printf "%s-gc" (include "node-feature-discovery.fullname" .)) 
.Values.gc.serviceAccount.name }} {{- else -}} - {{ default "default" .Values.topologyGC.serviceAccount.name }} + {{ default "default" .Values.gc.serviceAccount.name }} {{- end -}} {{- end -}} diff --git a/deployment/helm/node-feature-discovery/templates/clusterrole.yaml b/deployment/helm/node-feature-discovery/templates/clusterrole.yaml index 0cec23b663..d4329338be 100644 --- a/deployment/helm/node-feature-discovery/templates/clusterrole.yaml +++ b/deployment/helm/node-feature-discovery/templates/clusterrole.yaml @@ -42,8 +42,8 @@ rules: - update {{- end }} ---- {{- if and .Values.topologyUpdater.enable .Values.topologyUpdater.rbac.create }} +--- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: @@ -80,12 +80,12 @@ rules: - update {{- end }} +{{- if and .Values.gc.enable .Values.gc.rbac.create (or .Values.enableNodeFeatureApi .Values.topologyUpdater.enable) }} --- -{{- if and .Values.topologyGC.enable .Values.topologyGC.rbac.create .Values.topologyUpdater.enable }} apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: - name: {{ include "node-feature-discovery.fullname" . }}-topology-gc + name: {{ include "node-feature-discovery.fullname" . }}-gc labels: {{- include "node-feature-discovery.labels" . | nindent 4 }} rules: @@ -109,4 +109,11 @@ rules: verbs: - delete - list +- apiGroups: + - nfd.k8s-sigs.io + resources: + - nodefeatures + verbs: + - delete + - list {{- end }} diff --git a/deployment/helm/node-feature-discovery/templates/clusterrolebinding.yaml b/deployment/helm/node-feature-discovery/templates/clusterrolebinding.yaml index b0a69012fd..8e3aef83e1 100644 --- a/deployment/helm/node-feature-discovery/templates/clusterrolebinding.yaml +++ b/deployment/helm/node-feature-discovery/templates/clusterrolebinding.yaml @@ -15,8 +15,8 @@ subjects: namespace: {{ include "node-feature-discovery.namespace" . 
}} {{- end }} ---- {{- if and .Values.topologyUpdater.enable .Values.topologyUpdater.rbac.create }} +--- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding metadata: @@ -33,20 +33,20 @@ subjects: namespace: {{ include "node-feature-discovery.namespace" . }} {{- end }} +{{- if and .Values.gc.enable .Values.gc.rbac.create (or .Values.enableNodeFeatureApi .Values.topologyUpdater.enable) }} --- -{{- if and .Values.topologyGC.enable .Values.topologyGC.rbac.create .Values.topologyUpdater.enable }} apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding metadata: - name: {{ include "node-feature-discovery.fullname" . }}-topology-gc + name: {{ include "node-feature-discovery.fullname" . }}-gc labels: {{- include "node-feature-discovery.labels" . | nindent 4 }} roleRef: apiGroup: rbac.authorization.k8s.io kind: ClusterRole - name: {{ include "node-feature-discovery.fullname" . }}-topology-gc + name: {{ include "node-feature-discovery.fullname" . }}-gc subjects: - kind: ServiceAccount - name: {{ .Values.topologyGC.serviceAccount.name | default "nfd-topology-gc" }} + name: {{ .Values.gc.serviceAccount.name | default "nfd-gc" }} namespace: {{ include "node-feature-discovery.namespace" . }} {{- end }} diff --git a/deployment/helm/node-feature-discovery/templates/nfd-gc.yaml b/deployment/helm/node-feature-discovery/templates/nfd-gc.yaml index 642fec4559..ff0859ea14 100644 --- a/deployment/helm/node-feature-discovery/templates/nfd-gc.yaml +++ b/deployment/helm/node-feature-discovery/templates/nfd-gc.yaml @@ -1,36 +1,36 @@ -{{- if and .Values.topologyGC.enable .Values.topologyUpdater.enable -}} +{{- if and .Values.gc.enable (or .Values.enableNodeFeatureApi .Values.topologyUpdater.enable) -}} apiVersion: apps/v1 kind: Deployment metadata: - name: {{ include "node-feature-discovery.fullname" . }}-topology-gc + name: {{ include "node-feature-discovery.fullname" . }}-gc namespace: {{ include "node-feature-discovery.namespace" . 
}} labels: {{- include "node-feature-discovery.labels" . | nindent 4 }} - role: topology-gc + role: gc spec: - replicas: {{ .Values.topologyGC.replicaCount | default 1 }} + replicas: {{ .Values.gc.replicaCount | default 1 }} selector: matchLabels: {{- include "node-feature-discovery.selectorLabels" . | nindent 6 }} - role: topology-gc + role: gc template: metadata: labels: {{- include "node-feature-discovery.selectorLabels" . | nindent 8 }} - role: topology-gc + role: gc annotations: - {{- toYaml .Values.topologyGC.annotations | nindent 8 }} + {{- toYaml .Values.gc.annotations | nindent 8 }} spec: - serviceAccountName: {{ .Values.topologyGC.serviceAccountName | default "nfd-topology-gc" }} + serviceAccountName: {{ .Values.gc.serviceAccountName | default "nfd-gc" }} dnsPolicy: ClusterFirstWithHostNet {{- with .Values.imagePullSecrets }} imagePullSecrets: {{- toYaml . | nindent 8 }} {{- end }} securityContext: - {{- toYaml .Values.topologyGC.podSecurityContext | nindent 8 }} + {{- toYaml .Values.gc.podSecurityContext | nindent 8 }} containers: - - name: topology-gc + - name: gc image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}" imagePullPolicy: "{{ .Values.image.pullPolicy }}" env: @@ -39,25 +39,29 @@ spec: fieldRef: fieldPath: spec.nodeName command: - - "nfd-topology-gc" + - "nfd-gc" args: - {{- if .Values.topologyGC.interval | empty | not }} - - "-gc-interval={{ .Values.topologyGC.interval }}" + {{- if .Values.gc.interval | empty | not }} + - "-gc-interval={{ .Values.gc.interval }}" {{- end }} resources: - {{- toYaml .Values.topologyGC.resources | nindent 12 }} + {{- toYaml .Values.gc.resources | nindent 12 }} securityContext: - {{- toYaml .Values.topologyGC.securityContext | nindent 12 }} + allowPrivilegeEscalation: false + capabilities: + drop: [ "ALL" ] + readOnlyRootFilesystem: true + runAsNonRoot: true - {{- with .Values.topologyGC.nodeSelector }} + {{- with .Values.gc.nodeSelector }} nodeSelector: {{- toYaml . 
| nindent 8 }} {{- end }} - {{- with .Values.topologyGC.affinity }} + {{- with .Values.gc.affinity }} affinity: {{- toYaml . | nindent 8 }} {{- end }} - {{- with .Values.topologyGC.tolerations }} + {{- with .Values.gc.tolerations }} tolerations: {{- toYaml . | nindent 8 }} {{- end }} diff --git a/deployment/helm/node-feature-discovery/templates/serviceaccount.yaml b/deployment/helm/node-feature-discovery/templates/serviceaccount.yaml index 03211e7c49..dae09503e4 100644 --- a/deployment/helm/node-feature-discovery/templates/serviceaccount.yaml +++ b/deployment/helm/node-feature-discovery/templates/serviceaccount.yaml @@ -12,8 +12,8 @@ metadata: {{- end }} {{- end }} ---- {{- if and .Values.topologyUpdater.enable .Values.topologyUpdater.serviceAccount.create }} +--- apiVersion: v1 kind: ServiceAccount metadata: @@ -27,23 +27,23 @@ metadata: {{- end }} {{- end }} +{{- if and .Values.gc.enable .Values.gc.rbac.create (or .Values.enableNodeFeatureApi .Values.topologyUpdater.enable) }} --- -{{- if and .Values.topologyGC.enable .Values.topologyGC.serviceAccount.create .Values.topologyUpdater.enable }} apiVersion: v1 kind: ServiceAccount metadata: - name: {{ .Values.topologyGC.serviceAccount.name | default "nfd-topology-gc" }} + name: {{ .Values.gc.serviceAccount.name | default "nfd-gc" }} namespace: {{ include "node-feature-discovery.namespace" . }} labels: {{- include "node-feature-discovery.labels" . | nindent 4 }} - {{- with .Values.topologyUpdater.serviceAccount.annotations }} + {{- with .Values.gc.serviceAccount.annotations }} annotations: {{- toYaml . 
| nindent 4 }} {{- end }} {{- end }} ---- {{- if .Values.worker.serviceAccount.create }} +--- apiVersion: v1 kind: ServiceAccount metadata: diff --git a/deployment/helm/node-feature-discovery/values.yaml b/deployment/helm/node-feature-discovery/values.yaml index a259560665..80c50bf614 100644 --- a/deployment/helm/node-feature-discovery/values.yaml +++ b/deployment/helm/node-feature-discovery/values.yaml @@ -450,7 +450,7 @@ topologyUpdater: affinity: {} podSetFingerprint: true -topologyGC: +gc: enable: true replicaCount: 1 @@ -464,12 +464,6 @@ topologyGC: interval: 1h podSecurityContext: {} - securityContext: - allowPrivilegeEscalation: false - capabilities: - drop: [ "ALL" ] - readOnlyRootFilesystem: true - runAsNonRoot: true resources: {} # We usually recommend not to specify default resources and to leave this as a conscious diff --git a/docs/deployment/helm.md b/docs/deployment/helm.md index bc09e85c88..5056f9bdc5 100644 --- a/docs/deployment/helm.md +++ b/docs/deployment/helm.md @@ -189,24 +189,23 @@ API's you need to install the prometheus operator in your cluster. | `topologyUpdater.podSetFingerprint` | bool | false | Enables compute and report of pod fingerprint in NRT objects. | | `topologyUpdater.kubeletStateDir` | string | /var/lib/kubelet | Specifies kubelet state directory path for watching state and checkpoint files. Empty value disables kubelet state tracking. 
| -### Topology garbage collector parameters +### Garbage collector parameters | Name | Type | Default | description | -|-----------------------------------------------|--------|---------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| `topologyGC.*` | dict | | NFD Topology Garbage Collector configuration | -| `topologyGC.enable` | bool | true | Specifies whether the NFD Topology Garbage Collector should be created | -| `topologyGC.serviceAccount.create` | bool | true | Specifies whether the service account for topology garbage collector should be created | -| `topologyGC.serviceAccount.annotations` | dict | {} | Annotations to add to the service account for topology garbage collector | -| `topologyGC.serviceAccount.name` | string | | The name of the service account for topology garbage collector to use. If not set and create is true, a name is generated using the fullname template and `-topology-gc` suffix | -| `topologyGC.rbac.create` | bool | true | Specifies whether to create [RBAC][rbac] configuration for topology garbage collector | -| `topologyGC.interval` | string | 1h | Time between periodic garbage collector runs | -| `topologyGC.podSecurityContext` | dict | {} | [PodSecurityContext](https://kubernetes.io/docs/tasks/configure-pod-container/security-context/#set-the-security-context-for-a-pod) holds pod-level security attributes and common container settings | -| `topologyGC.securityContext` | dict | {} | Container [security settings](https://kubernetes.io/docs/tasks/configure-pod-container/security-context/#set-the-security-context-for-a-container) | -| `topologyGC.resources` | dict | {} | Topology garbage collector pod [resources management](https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/) | -| `topologyGC.nodeSelector` | dict | {} | Topology garbage collector pod [node 
selector](https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/#nodeselector) | -| `topologyGC.tolerations` | dict | {} | Topology garbage collector pod [node tolerations](https://kubernetes.io/docs/concepts/scheduling-eviction/taint-and-toleration/) | -| `topologyGC.annotations` | dict | {} | Topology garbage collector pod [annotations](https://kubernetes.io/docs/concepts/overview/working-with-objects/annotations/) | -| `topologyGC.affinity` | dict | {} | Topology garbage collector pod [affinity](https://kubernetes.io/docs/tasks/configure-pod-container/assign-pods-nodes-using-node-affinity/) | +|---------------------------------------|--------|---------|-------------------- +| `gc.*` | dict | | NFD Garbage Collector configuration +| `gc.enable` | bool | true | Specifies whether the NFD Garbage Collector should be created +| `gc.serviceAccount.create` | bool | true | Specifies whether the service account for garbage collector should be created +| `gc.serviceAccount.annotations` | dict | {} | Annotations to add to the service account for garbage collector +| `gc.serviceAccount.name` | string | | The name of the service account for garbage collector to use. 
If not set and create is true, a name is generated using the fullname template and `-gc` suffix +| `gc.rbac.create` | bool | true | Specifies whether to create [RBAC][rbac] configuration for garbage collector +| `gc.interval` | string | 1h | Time between periodic garbage collector runs +| `gc.podSecurityContext` | dict | {} | [PodSecurityContext](https://kubernetes.io/docs/tasks/configure-pod-container/security-context/#set-the-security-context-for-a-pod) holds pod-level security attributes and common container settings +| `gc.resources` | dict | {} | Garbage collector pod [resources management](https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/) +| `gc.nodeSelector` | dict | {} | Garbage collector pod [node selector](https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/#nodeselector) +| `gc.tolerations` | dict | {} | Garbage collector pod [node tolerations](https://kubernetes.io/docs/concepts/scheduling-eviction/taint-and-toleration/) +| `gc.annotations` | dict | {} | Garbage collector pod [annotations](https://kubernetes.io/docs/concepts/overview/working-with-objects/annotations/) +| `gc.affinity` | dict | {} | Garbage collector pod [affinity](https://kubernetes.io/docs/tasks/configure-pod-container/assign-pods-nodes-using-node-affinity/) [rbac]: https://kubernetes.io/docs/reference/access-authn-authz/rbac/ From a15b5690b6a48a24b07f828e4b0145a49d6a996f Mon Sep 17 00:00:00 2001 From: Markus Lehtonen Date: Fri, 18 Aug 2023 14:38:13 +0300 Subject: [PATCH 6/6] docs: update to cover nfd-gc --- docs/get-started/introduction.md | 12 ++++---- ...ference.md => gc-commandline-reference.md} | 10 +++---- docs/usage/nfd-gc.md | 30 +++++++++++++++++++ docs/usage/nfd-topology-gc.md | 29 ------------------ 4 files changed, 41 insertions(+), 40 deletions(-) rename docs/reference/{topology-gc-commandline-reference.md => gc-commandline-reference.md} (63%) create mode 100644 docs/usage/nfd-gc.md delete mode 100644 
docs/usage/nfd-topology-gc.md diff --git a/docs/get-started/introduction.md b/docs/get-started/introduction.md index 8f34ee839c..f69d6dc3e4 100644 --- a/docs/get-started/introduction.md +++ b/docs/get-started/introduction.md @@ -26,7 +26,7 @@ NFD consists of four software components: 1. nfd-master 1. nfd-worker 1. nfd-topology-updater -1. nfd-topology-gc +1. nfd-gc ## NFD-Master @@ -50,13 +50,13 @@ creates or updates a resource object specific to this node. One instance of nfd-topology-updater is supposed to be running on each node of the cluster. -## NFD-Topology-GC +## NFD-GC -NFD-Topology-GC is a daemon responsible for cleaning obsolete -[NodeResourceTopology](../usage/custom-resources.md#noderesourcetopology) objects, -obsolete means that there is no corresponding worker node. +NFD-GC is a daemon responsible for cleaning obsolete +[NodeFeature](../usage/custom-resources.md#nodefeature) and +[NodeResourceTopology](../usage/custom-resources.md#noderesourcetopology) objects. -One instance of nfd-topology-gc is supposed to be running in the cluster. +One instance of nfd-gc is supposed to be running in the cluster. ## Feature Discovery diff --git a/docs/reference/topology-gc-commandline-reference.md b/docs/reference/gc-commandline-reference.md similarity index 63% rename from docs/reference/topology-gc-commandline-reference.md rename to docs/reference/gc-commandline-reference.md index cf44bd33cd..101adaf280 100644 --- a/docs/reference/topology-gc-commandline-reference.md +++ b/docs/reference/gc-commandline-reference.md @@ -1,10 +1,10 @@ --- -title: "Topology Garbage Collector Cmdline Reference" +title: "Garbage Collector Cmdline Reference" layout: default sort: 7 --- -# NFD-Topology-Garbage-Collector Commandline Flags +# NFD-GC Commandline Flags {: .no_toc } ## Table of Contents @@ -15,12 +15,12 @@ sort: 7 --- -To quickly view available command line flags execute `nfd-topology-gc -help`. +To quickly view available command line flags execute `nfd-gc -help`. 
In a docker container: ```bash docker run {{ site.container_image }} \ -nfd-topology-gc -help +nfd-gc -help ``` ### -h, -help @@ -40,5 +40,5 @@ Default: 1h Example: ```bash -nfd-topology-gc -gc-interval=1h +nfd-gc -gc-interval=1h ``` diff --git a/docs/usage/nfd-gc.md b/docs/usage/nfd-gc.md new file mode 100644 index 0000000000..b14282989a --- /dev/null +++ b/docs/usage/nfd-gc.md @@ -0,0 +1,30 @@ +--- +title: "NFD-Garbage-Collector" +layout: default +sort: 6 +--- + +# NFD-GC +{: .no_toc} + +--- + +NFD-GC (NFD Garbage-Collector) is preferably run as a Kubernetes deployment +with one replica. It makes sure that all +[NodeFeature](custom-resources.md#nodefeature) and +[NodeResourceTopology](custom-resources.md#noderesourcetopology) objects +have corresponding nodes and removes stale objects for non-existent nodes. + +The daemon watches for Node deletion events and removes NodeFeature and +NodeResourceTopology objects upon them. It also runs periodically to make sure +no node delete event was missed and to remove any NodeFeature or +NodeResourceTopology objects that were created without corresponding node. The +default garbage collector interval is set to 1h which is the value when no +-gc-interval is specified. + +## Configuration + +In Helm deployments (see +[garbage collector parameters](../deployment/helm.md#garbage-collector-parameters)) +NFD-GC will only be deployed when `enableNodeFeatureApi` or +`topologyUpdater.enable` is set to true. diff --git a/docs/usage/nfd-topology-gc.md b/docs/usage/nfd-topology-gc.md deleted file mode 100644 index e87ef5d52e..0000000000 --- a/docs/usage/nfd-topology-gc.md +++ /dev/null @@ -1,29 +0,0 @@ ---- -title: "NFD-Topology-Garbage-Collector" -layout: default -sort: 6 ---- - -# NFD-Topology-Garbage-Collector -{: .no_toc} - ---- - -NFD-Topology-Garbage-Collector is preferably run as a Kubernetes deployment -with one replica. 
It makes sure that all -[NodeResourceTopology](custom-resources.md#noderesourcetopology) -have corresponding worker nodes and removes stale objects for worker nodes -which are no longer part of Kubernetes cluster. - -This service watches for Node deletion events and removes NodeResourceTopology -objects upon them. It is also running periodically to make sure no event was -missed or NodeResourceTopology object was created without corresponding worker -node. The default garbage collector interval is set to 1h which is the value -when no -gc-interval is specified. - -## Topology-Garbage-Collector Configuration - -In Helm deployments, -(see [Topology Garbage Collector](../deployment/helm.md#topology-garbage-collector-parameters) -for parameters). NFD-Topology-Garbage-Collector will only be deployed when -topologyUpdater.enable is set to true.