Merge pull request #439 from fromanirh/nrt-reserve-impl-enh
Add local cache for noderesourcetopology using the reserve plugin [3/3] - enhancements
k8s-ci-robot authored Dec 15, 2022
2 parents a4d42b7 + 5391088 commit 07d6327
Showing 16 changed files with 505 additions and 41 deletions.
@@ -61,7 +61,7 @@ rules:
   verbs: ["get", "list", "watch"]
 - apiGroups: ["topology.node.k8s.io"]
   resources: ["noderesourcetopologies"]
-  verbs: ["*"]
+  verbs: ["get", "list", "watch"]
 # resources need to be updated with the scheduler plugins used
 - apiGroups: ["scheduling.sigs.k8s.io"]
   resources: ["podgroups", "elasticquotas"]
@@ -93,7 +93,7 @@ rules:
   verbs: ["get", "list", "watch", "patch"]
 - apiGroups: ["topology.node.k8s.io"]
   resources: ["noderesourcetopologies"]
-  verbs: ["*"]
+  verbs: ["get", "list", "watch"]
 # resources need to be updated with the scheduler plugins used
 - apiGroups: ["scheduling.sigs.k8s.io"]
   resources: ["podgroups", "elasticquotas"]
2 changes: 1 addition & 1 deletion manifests/noderesourcetopology/cluster-role.yaml
@@ -11,7 +11,7 @@ metadata:
 rules:
 - apiGroups: ["topology.node.k8s.io"]
   resources: ["noderesourcetopologies"]
-  verbs: ["*"]
+  verbs: ["get", "list", "watch"]
 - apiGroups: [""]
   resources: ["nodes"]
   verbs: ["get", "list", "patch"]
44 changes: 42 additions & 2 deletions pkg/noderesourcetopology/README.md
@@ -9,8 +9,8 @@ Document capturing the NodeResourceTopology API Custom Resource Definition Stand
 <!-- Check one of the values: Sample, Alpha, Beta, GA -->
 
 - [ ] 💡 Sample (for demonstrating and inspiring purpose)
-- [x] 👶 Alpha (used in companies for pilot projects)
-- [ ] 👦 Beta (used in companies and developed actively)
+- [ ] 👶 Alpha (used in companies for pilot projects)
+- [x] 👦 Beta (used in companies and developed actively)
 - [ ] 👨 Stable (used in companies for production workloads)
 
 ## Tutorial
@@ -58,6 +58,46 @@ profiles:
         type: "LeastAllocated"
 ```
+
+#### Scheduler-side cache with the reserve plugin
+
+The quality of the scheduling decisions made by the "NodeResourceTopologyMatch" filter and score plugins depends on the freshness of the resource allocation data.
+When deployed on large clusters, or when facing high pod churn, or both, it is often impractical or impossible to get updates frequently enough, so the scheduler plugins
+may run with stale data and make suboptimal scheduling decisions.
+Using the Reserve plugin, the "NodeResourceTopologyMatch" Filter and Score plugins can use a pessimistic, overreserving cache which prevents these suboptimal decisions at the cost
+of leaving pods pending longer. This cache is described in detail in [the docs/ directory](docs/).
+To enable the cache, you need to **both** enable the Reserve plugin **and** set the `cacheResyncPeriodSeconds` config option. Values less than 5 seconds are not recommended
+for performance reasons.
+
+```yaml
+apiVersion: kubescheduler.config.k8s.io/v1beta2
+kind: KubeSchedulerConfiguration
+leaderElection:
+  leaderElect: false
+clientConnection:
+  kubeconfig: "/etc/kubernetes/scheduler.conf"
+profiles:
+- schedulerName: topo-aware-scheduler
+  plugins:
+    filter:
+      enabled:
+      - name: NodeResourceTopologyMatch
+    reserve:
+      enabled:
+      - name: NodeResourceTopologyMatch
+    score:
+      enabled:
+      - name: NodeResourceTopologyMatch
+  # optional plugin configs
+  pluginConfig:
+  - name: NodeResourceTopologyMatch
+    args:
+      # other strategies are MostAllocated and BalancedAllocation
+      scoringStrategy:
+        type: "LeastAllocated"
+      cacheResyncPeriodSeconds: 5
+```
 
 #### Cluster
 
 The Topology-aware scheduler bases its decisions on a number of node-specific hardware details or configuration settings which have node granularity (not cluster granularity).
12 changes: 11 additions & 1 deletion pkg/noderesourcetopology/cache/cache.go
@@ -28,7 +28,9 @@ type Interface interface {
 	// Over-reserved resources are the resources consumed by pods scheduled to that node after the last update
 	// of NRT pertaining to the same node, pessimistically overallocated on ALL the NUMA zones of the node.
 	// The pod argument is used only for logging purposes.
-	GetCachedNRTCopy(nodeName string, pod *corev1.Pod) *topologyv1alpha1.NodeResourceTopology
+	// Returns a boolean to signal the caller whether the NRT data is clean. If false, the node has foreign
+	// pods detected, so it should be ignored or handled differently by the caller.
+	GetCachedNRTCopy(nodeName string, pod *corev1.Pod) (*topologyv1alpha1.NodeResourceTopology, bool)
 
 	// NodeMaybeOverReserved declares a node was filtered out for not enough resources available.
 	// This means this node is eligible for a resync. When a node is marked discarded (dirty), it matters not
@@ -37,6 +39,14 @@ type Interface interface {
 	// The pod argument is used only for logging purposes.
 	NodeMaybeOverReserved(nodeName string, pod *corev1.Pod)
 
+	// NodeHasForeignPods declares we observed one or more pods on this node which weren't scheduled by this
+	// scheduler instance. This means the resource accounting is likely out of date, so this function also signals
+	// a cache resync is needed for this node.
+	// Until that happens, this node should not be considered in the scheduling decisions, as if it had zero resources
+	// available. Note this condition is different from "no topology info available":
+	// the former is an always-fail, the latter an always-succeed.
+	NodeHasForeignPods(nodeName string, pod *corev1.Pod)
+
 	// ReserveNodeResources adds the resources requested by a pod to the assumed resources for the node on which the pod
 	// is scheduled. This is a prerequisite for the pessimistic overallocation tracking.
 	// Additionally, this function resets the discarded counter for the same node. Being able to handle a pod means
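For context, here is a minimal sketch of how a Filter plugin could consume the new two-value return, following the semantics documented above: foreign pods make the node always fail, while missing topology info always succeeds. This is not code from this PR; the function name, package layout, and the elided fit logic are illustrative assumptions.

```go
package example

import (
	corev1 "k8s.io/api/core/v1"
	"k8s.io/kubernetes/pkg/scheduler/framework"

	"sigs.k8s.io/scheduler-plugins/pkg/noderesourcetopology/cache"
)

// filterWithCache is a hypothetical caller of the cache: it maps the
// "clean" boolean to the documented always-fail / always-succeed rules.
func filterWithCache(cc cache.Interface, pod *corev1.Pod, nodeName string) *framework.Status {
	nrt, clean := cc.GetCachedNRTCopy(nodeName, pod)
	if !clean {
		// Foreign pods were seen on this node: the cached accounting is
		// unreliable until the next resync, so never admit the node.
		return framework.NewStatus(framework.Unschedulable, "node has foreign pods")
	}
	if nrt == nil {
		// No topology info available for this node: nothing to verify,
		// so always admit the node.
		return nil
	}
	// ... evaluate the pod's requests against nrt's per-NUMA-zone resources ...
	return nil
}
```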
81 changes: 81 additions & 0 deletions pkg/noderesourcetopology/cache/foreign_pods.go
@@ -0,0 +1,81 @@
/*
Copyright 2022 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package cache

import (
	corev1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/util/sets"
	k8scache "k8s.io/client-go/tools/cache"
	"k8s.io/klog/v2"
)

// The nodeIndexer and the go client facilities are global objects, so we need this to be global as well.
// Each profile must register its name in order to support foreign pods detection. Foreign pods are pods
// which are scheduled to nodes without the caching machinery knowing.
// Note this is NOT lock-protected because the scheduling framework calls New() sequentially,
// and plugin instances are meant to register their name using SetupForeignPodsDetector inside their New().
var (
	schedProfileNames = sets.String{}
)

func SetupForeignPodsDetector(schedProfileName string, podInformer k8scache.SharedInformer, cc Interface) {
	foreignCache := func(obj interface{}) {
		pod, ok := obj.(*corev1.Pod)
		if !ok {
			klog.V(3).Infof("nrtcache: foreign: unsupported object %T", obj)
			return
		}
		if !IsForeignPod(pod) {
			return
		}

		cc.NodeHasForeignPods(pod.Spec.NodeName, pod)
		klog.V(6).InfoS("nrtcache: has foreign pods", "logID", klog.KObj(pod), "node", pod.Spec.NodeName, "podUID", pod.UID)
	}

	podInformer.AddEventHandler(k8scache.ResourceEventHandlerFuncs{
		AddFunc: foreignCache,
		UpdateFunc: func(oldObj, newObj interface{}) {
			foreignCache(newObj)
		},
		DeleteFunc: foreignCache,
	})
}

func RegisterSchedulerProfileName(schedProfileName string) {
	klog.InfoS("nrtcache: setting up foreign pod detection", "profile", schedProfileName)
	schedProfileNames.Insert(schedProfileName)

	klog.V(5).InfoS("nrtcache: registered scheduler profiles", "names", schedProfileNames.List())
}

func IsForeignPod(pod *corev1.Pod) bool {
	if pod.Spec.NodeName == "" {
		// not yet bound to a node: nothing to do yet
		return false
	}
	if schedProfileNames.Has(pod.Spec.SchedulerName) {
		// scheduled by one of our own profiles: we already know about this pod
		return false
	}
	return true
}

// for testing only; NOT thread safe
func CleanRegisteredSchedulerProfileNames() {
	schedProfileNames = sets.String{}
}
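The detector is meant to be wired from each plugin profile's New(). A minimal sketch of that wiring, assuming access to a framework.Handle; the helper name and its placement are illustrative, not part of this change:

```go
package example

import (
	"k8s.io/kubernetes/pkg/scheduler/framework"

	nrtcache "sigs.k8s.io/scheduler-plugins/pkg/noderesourcetopology/cache"
)

// setupForeignPodsDetection registers this profile's name first, so pods
// scheduled by this profile are not misclassified as foreign, then hooks
// the detector into the shared pod informer.
func setupForeignPodsDetection(profileName string, handle framework.Handle, cc nrtcache.Interface) {
	nrtcache.RegisterSchedulerProfileName(profileName)
	podInformer := handle.SharedInformerFactory().Core().V1().Pods().Informer()
	nrtcache.SetupForeignPodsDetector(profileName, podInformer, cc)
}
```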
109 changes: 109 additions & 0 deletions pkg/noderesourcetopology/cache/foreign_pods_test.go
@@ -0,0 +1,109 @@
/*
Copyright 2022 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package cache

import (
	"testing"

	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

func TestIsForeignPod(t *testing.T) {
	tests := []struct {
		name         string
		profileNames []string
		pod          *corev1.Pod
		expected     bool
	}{
		{
			name: "empty",
			pod: &corev1.Pod{
				ObjectMeta: metav1.ObjectMeta{
					Name:      "pod",
					Namespace: "default",
				},
			},
		},
		{
			name:         "no-node-no-profile",
			profileNames: []string{"secondary-scheduler"},
			pod: &corev1.Pod{
				ObjectMeta: metav1.ObjectMeta{
					Name:      "pod",
					Namespace: "default",
				},
			},
		},
		{
			name:         "node-no-profile",
			profileNames: []string{"secondary-scheduler"},
			pod: &corev1.Pod{
				ObjectMeta: metav1.ObjectMeta{
					Name:      "pod",
					Namespace: "default",
				},
				Spec: corev1.PodSpec{
					NodeName: "random-node",
				},
			},
			expected: true,
		},
		{
			name:         "node-profile",
			profileNames: []string{"secondary-scheduler"},
			pod: &corev1.Pod{
				ObjectMeta: metav1.ObjectMeta{
					Name:      "pod",
					Namespace: "default",
				},
				Spec: corev1.PodSpec{
					NodeName:      "random-node",
					SchedulerName: "secondary-scheduler",
				},
			},
		},
		{
			name:         "node-multi-profile",
			profileNames: []string{"secondary-scheduler-A", "secondary-scheduler-B", "fancy-scheduler"},
			pod: &corev1.Pod{
				ObjectMeta: metav1.ObjectMeta{
					Name:      "pod",
					Namespace: "default",
				},
				Spec: corev1.PodSpec{
					NodeName:      "random-node",
					SchedulerName: "secondary-scheduler-B",
				},
			},
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			for _, profileName := range tt.profileNames {
				RegisterSchedulerProfileName(profileName)
			}
			defer CleanRegisteredSchedulerProfileNames()

			got := IsForeignPod(tt.pod)
			if got != tt.expected {
				t.Errorf("pod %s foreign status got %v expected %v", tt.pod.Name, got, tt.expected)
			}
		})
	}
}
10 changes: 9 additions & 1 deletion pkg/noderesourcetopology/cache/indexer.go
@@ -96,7 +96,15 @@ func NewNodeNameIndexer(podInformer k8scache.SharedInformer) NodeIndexer {
 			}
 			nni.addPod(pod)
 		},
-		// Updates are not yet supported
+		// need to track pods scheduled by the default scheduler
+		UpdateFunc: func(oldObj, newObj interface{}) {
+			pod, ok := newObj.(*corev1.Pod)
+			if !ok {
+				klog.V(3).Infof("nrtcache: nni: update unsupported object %T", newObj)
+				return
+			}
+			nni.addPod(pod)
+		},
 		DeleteFunc: func(obj interface{}) {
 			pod, ok := obj.(*corev1.Pod)
 			if !ok {
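To illustrate why the indexer must now handle updates: a pod placed by the default scheduler typically appears first without a node assignment, and only a later update event carries the binding. A hypothetical event sequence, with all names illustrative:

```go
package example

import (
	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// bindingSequence returns the two snapshots of a foreign pod the informer
// would deliver: the Add event has no node yet, so only the Update makes
// the pod indexable by node name.
func bindingSequence() (added, updated *corev1.Pod) {
	added = &corev1.Pod{
		ObjectMeta: metav1.ObjectMeta{Name: "foreign-pod", Namespace: "default"},
		// Spec.NodeName is empty at creation time.
	}
	updated = added.DeepCopy()
	updated.Spec.NodeName = "worker-0" // set once the default scheduler binds the pod
	return added, updated
}
```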
1 change: 0 additions & 1 deletion pkg/noderesourcetopology/cache/indexer_test.go
@@ -162,7 +162,6 @@ func TestNodeNameIndexerByIndexAdd(t *testing.T) {
 			}
 		})
 	}
-
 }
 
 const (