Merge pull request #439 from fromanirh/nrt-reserve-impl-enh
Add local cache for noderesourcetopology using the reserve plugin [3/3] - enhancements
k8s-ci-robot authored Dec 15, 2022
2 parents a4d42b7 + 5391088 commit 07d6327
Showing 16 changed files with 505 additions and 41 deletions.
@@ -61,7 +61,7 @@ rules:
   verbs: ["get", "list", "watch"]
 - apiGroups: ["topology.node.k8s.io"]
   resources: ["noderesourcetopologies"]
-  verbs: ["*"]
+  verbs: ["get", "list", "watch"]
 # resources need to be updated with the scheduler plugins used
 - apiGroups: ["scheduling.sigs.k8s.io"]
   resources: ["podgroups", "elasticquotas"]
@@ -93,7 +93,7 @@ rules:
   verbs: ["get", "list", "watch", "patch"]
 - apiGroups: ["topology.node.k8s.io"]
   resources: ["noderesourcetopologies"]
-  verbs: ["*"]
+  verbs: ["get", "list", "watch"]
 # resources need to be updated with the scheduler plugins used
 - apiGroups: ["scheduling.sigs.k8s.io"]
   resources: ["podgroups", "elasticquotas"]
2 changes: 1 addition & 1 deletion manifests/noderesourcetopology/cluster-role.yaml
@@ -11,7 +11,7 @@ metadata:
 rules:
 - apiGroups: ["topology.node.k8s.io"]
   resources: ["noderesourcetopologies"]
-  verbs: ["*"]
+  verbs: ["get", "list", "watch"]
 - apiGroups: [""]
   resources: ["nodes"]
   verbs: ["get", "list", "patch"]
44 changes: 42 additions & 2 deletions pkg/noderesourcetopology/README.md
@@ -9,8 +9,8 @@ Document capturing the NodeResourceTopology API Custom Resource Definition Stand
 <!-- Check one of the values: Sample, Alpha, Beta, GA -->
 
 - [ ] 💡 Sample (for demonstrating and inspiring purpose)
-- [x] 👶 Alpha (used in companies for pilot projects)
-- [ ] 👦 Beta (used in companies and developed actively)
+- [ ] 👶 Alpha (used in companies for pilot projects)
+- [x] 👦 Beta (used in companies and developed actively)
 - [ ] 👨 Stable (used in companies for production workloads)
 
 ## Tutorial
@@ -58,6 +58,46 @@ profiles:
         type: "LeastAllocated"
 ```
+
+#### Scheduler-side cache with the reserve plugin
+
+The quality of the scheduling decisions made by the "NodeResourceTopologyMatch" filter and score plugins depends on the freshness of the resource allocation data.
+When deployed on large clusters, or when facing high pod churn, or both, it is often impractical or impossible to get updates frequently enough, so the scheduler plugins
+may run with stale data and make suboptimal scheduling decisions.
+Using the Reserve plugin, the "NodeResourceTopologyMatch" Filter and Score plugins can use a pessimistic, overreserving cache which prevents these suboptimal decisions at the cost
+of leaving pods pending longer. This cache is described in detail in [the docs/ directory](docs/).
+To enable the cache, you need to **both** enable the Reserve plugin **and** set the `cacheResyncPeriodSeconds` config option. Values less than 5 seconds are not recommended
+for performance reasons.
+
+```yaml
+apiVersion: kubescheduler.config.k8s.io/v1beta2
+kind: KubeSchedulerConfiguration
+leaderElection:
+  leaderElect: false
+clientConnection:
+  kubeconfig: "/etc/kubernetes/scheduler.conf"
+profiles:
+- schedulerName: topo-aware-scheduler
+  plugins:
+    filter:
+      enabled:
+      - name: NodeResourceTopologyMatch
+    reserve:
+      enabled:
+      - name: NodeResourceTopologyMatch
+    score:
+      enabled:
+      - name: NodeResourceTopologyMatch
+  # optional plugin configs
+  pluginConfig:
+  - name: NodeResourceTopologyMatch
+    args:
+      # other strategies are MostAllocated and BalancedAllocation
+      scoringStrategy:
+        type: "LeastAllocated"
+      cacheResyncPeriodSeconds: 5
+```
 
 #### Cluster
 
 The Topology-aware scheduler bases its decisions on a number of node-specific hardware details or configuration settings which have node granularity (not cluster granularity).
12 changes: 11 additions & 1 deletion pkg/noderesourcetopology/cache/cache.go
@@ -28,7 +28,9 @@ type Interface interface {
 	// Over-reserved resources are the resources consumed by pods scheduled to that node after the last update
 	// of NRT pertaining to the same node, pessimistically overallocated on ALL the NUMA zones of the node.
 	// The pod argument is used only for logging purposes.
-	GetCachedNRTCopy(nodeName string, pod *corev1.Pod) *topologyv1alpha1.NodeResourceTopology
+	// Returns a boolean to signal the caller whether the NRT data is clean. If false, the node has foreign
+	// pods detected, so it should be ignored or handled differently by the caller.
+	GetCachedNRTCopy(nodeName string, pod *corev1.Pod) (*topologyv1alpha1.NodeResourceTopology, bool)
 
 	// NodeMaybeOverReserved declares a node was filtered out for not enough resources available.
 	// This means this node is eligible for a resync. When a node is marked discarded (dirty), it matters not
@@ -37,6 +39,14 @@ type Interface interface {
 	// The pod argument is used only for logging purposes.
 	NodeMaybeOverReserved(nodeName string, pod *corev1.Pod)
 
+	// NodeHasForeignPods declares we observed one or more pods on this node which weren't scheduled by this
+	// scheduler instance. This means the resource accounting is likely out of date, so this function also signals
+	// a cache resync is needed for this node.
+	// Until that happens, this node should not be considered in the scheduling decisions, as if it had zero resources
+	// available. Note this condition is different from "no topology info available":
+	// the former is an always-fail, the latter an always-succeed.
+	NodeHasForeignPods(nodeName string, pod *corev1.Pod)
+
 	// ReserveNodeResources adds the resources requested by a pod to the assumed resources for the node on which the pod
 	// is scheduled. This is a prerequisite for the pessimistic overallocation tracking.
 	// Additionally, this function resets the discarded counter for the same node. Being able to handle a pod means
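For context, here is a minimal sketch of how a Filter plugin could consume the new two-value return, following the semantics documented above: foreign pods make the node always fail, while missing topology info always succeeds. This is not code from this PR; the function name, package layout, and the elided fit logic are illustrative assumptions.

```go
package example

import (
	corev1 "k8s.io/api/core/v1"
	"k8s.io/kubernetes/pkg/scheduler/framework"

	"sigs.k8s.io/scheduler-plugins/pkg/noderesourcetopology/cache"
)

// filterWithCache is a hypothetical caller of the cache: it maps the
// "clean" boolean to the documented always-fail / always-succeed rules.
func filterWithCache(cc cache.Interface, pod *corev1.Pod, nodeName string) *framework.Status {
	nrt, clean := cc.GetCachedNRTCopy(nodeName, pod)
	if !clean {
		// Foreign pods were seen on this node: the cached accounting is
		// unreliable until the next resync, so never admit the node.
		return framework.NewStatus(framework.Unschedulable, "node has foreign pods")
	}
	if nrt == nil {
		// No topology info available for this node: nothing to verify,
		// so always admit the node.
		return nil
	}
	// ... evaluate the pod's requests against nrt's per-NUMA-zone resources ...
	return nil
}
```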
81 changes: 81 additions & 0 deletions pkg/noderesourcetopology/cache/foreign_pods.go
@@ -0,0 +1,81 @@
/*
Copyright 2022 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package cache

import (
	corev1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/util/sets"
	k8scache "k8s.io/client-go/tools/cache"
	"k8s.io/klog/v2"
)

// The nodeIndexer and the go client facilities are global objects, so we need this to be global as well.
// Each profile must register its name in order to support foreign pods detection. Foreign pods are pods
// which are scheduled to nodes without the caching machinery knowing.
// Note this is NOT lock-protected because the scheduling framework calls New() sequentially,
// and plugin instances are meant to register their name using SetupForeignPodsDetector inside their New().
var (
	schedProfileNames = sets.String{}
)

func SetupForeignPodsDetector(schedProfileName string, podInformer k8scache.SharedInformer, cc Interface) {
	foreignCache := func(obj interface{}) {
		pod, ok := obj.(*corev1.Pod)
		if !ok {
			klog.V(3).Infof("nrtcache: foreign: unsupported object %T", obj)
			return
		}
		if !IsForeignPod(pod) {
			return
		}

		cc.NodeHasForeignPods(pod.Spec.NodeName, pod)
		klog.V(6).InfoS("nrtcache: has foreign pods", "logID", klog.KObj(pod), "node", pod.Spec.NodeName, "podUID", pod.UID)
	}

	podInformer.AddEventHandler(k8scache.ResourceEventHandlerFuncs{
		AddFunc: foreignCache,
		UpdateFunc: func(oldObj, newObj interface{}) {
			foreignCache(newObj)
		},
		DeleteFunc: foreignCache,
	})
}

func RegisterSchedulerProfileName(schedProfileName string) {
	klog.InfoS("nrtcache: setting up foreign pod detection", "profile", schedProfileName)
	schedProfileNames.Insert(schedProfileName)

	klog.V(5).InfoS("nrtcache: registered scheduler profiles", "names", schedProfileNames.List())
}

func IsForeignPod(pod *corev1.Pod) bool {
	if pod.Spec.NodeName == "" {
		// not yet bound to a node: nothing to do yet
		return false
	}
	if schedProfileNames.Has(pod.Spec.SchedulerName) {
		// scheduled by one of our own profiles: we already know about this pod
		return false
	}
	return true
}

// for testing only; NOT thread safe
func CleanRegisteredSchedulerProfileNames() {
	schedProfileNames = sets.String{}
}
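The detector is meant to be wired from each plugin profile's New(). A minimal sketch of that wiring, assuming access to a framework.Handle; the helper name and its placement are illustrative, not part of this change:

```go
package example

import (
	"k8s.io/kubernetes/pkg/scheduler/framework"

	nrtcache "sigs.k8s.io/scheduler-plugins/pkg/noderesourcetopology/cache"
)

// setupForeignPodsDetection registers this profile's name first, so pods
// scheduled by this profile are not misclassified as foreign, then hooks
// the detector into the shared pod informer.
func setupForeignPodsDetection(profileName string, handle framework.Handle, cc nrtcache.Interface) {
	nrtcache.RegisterSchedulerProfileName(profileName)
	podInformer := handle.SharedInformerFactory().Core().V1().Pods().Informer()
	nrtcache.SetupForeignPodsDetector(profileName, podInformer, cc)
}
```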
109 changes: 109 additions & 0 deletions pkg/noderesourcetopology/cache/foreign_pods_test.go
@@ -0,0 +1,109 @@
/*
Copyright 2022 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package cache

import (
	"testing"

	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

func TestIsForeignPod(t *testing.T) {
	tests := []struct {
		name         string
		profileNames []string
		pod          *corev1.Pod
		expected     bool
	}{
		{
			name: "empty",
			pod: &corev1.Pod{
				ObjectMeta: metav1.ObjectMeta{
					Name:      "pod",
					Namespace: "default",
				},
			},
		},
		{
			name:         "no-node-no-profile",
			profileNames: []string{"secondary-scheduler"},
			pod: &corev1.Pod{
				ObjectMeta: metav1.ObjectMeta{
					Name:      "pod",
					Namespace: "default",
				},
			},
		},
		{
			name:         "node-no-profile",
			profileNames: []string{"secondary-scheduler"},
			pod: &corev1.Pod{
				ObjectMeta: metav1.ObjectMeta{
					Name:      "pod",
					Namespace: "default",
				},
				Spec: corev1.PodSpec{
					NodeName: "random-node",
				},
			},
			expected: true,
		},
		{
			name:         "node-profile",
			profileNames: []string{"secondary-scheduler"},
			pod: &corev1.Pod{
				ObjectMeta: metav1.ObjectMeta{
					Name:      "pod",
					Namespace: "default",
				},
				Spec: corev1.PodSpec{
					NodeName:      "random-node",
					SchedulerName: "secondary-scheduler",
				},
			},
		},
		{
			name:         "node-multi-profile",
			profileNames: []string{"secondary-scheduler-A", "secondary-scheduler-B", "fancy-scheduler"},
			pod: &corev1.Pod{
				ObjectMeta: metav1.ObjectMeta{
					Name:      "pod",
					Namespace: "default",
				},
				Spec: corev1.PodSpec{
					NodeName:      "random-node",
					SchedulerName: "secondary-scheduler-B",
				},
			},
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			for _, profileName := range tt.profileNames {
				RegisterSchedulerProfileName(profileName)
			}
			defer CleanRegisteredSchedulerProfileNames()

			got := IsForeignPod(tt.pod)
			if got != tt.expected {
				t.Errorf("pod %s foreign status got %v expected %v", tt.pod.Name, got, tt.expected)
			}
		})
	}
}
10 changes: 9 additions & 1 deletion pkg/noderesourcetopology/cache/indexer.go
@@ -96,7 +96,15 @@ func NewNodeNameIndexer(podInformer k8scache.SharedInformer) NodeIndexer {
 			}
 			nni.addPod(pod)
 		},
-		// Updates are not yet supported
+		// need to track pods scheduled by the default scheduler
+		UpdateFunc: func(oldObj, newObj interface{}) {
+			pod, ok := newObj.(*corev1.Pod)
+			if !ok {
+				klog.V(3).Infof("nrtcache: nni: update unsupported object %T", newObj)
+				return
+			}
+			nni.addPod(pod)
+		},
 		DeleteFunc: func(obj interface{}) {
 			pod, ok := obj.(*corev1.Pod)
 			if !ok {
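To illustrate why the indexer must now handle updates: a pod placed by the default scheduler typically appears first without a node assignment, and only a later update event carries the binding. A hypothetical event sequence, with all names illustrative:

```go
package example

import (
	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// bindingSequence returns the two snapshots of a foreign pod the informer
// would deliver: the Add event has no node yet, so only the Update makes
// the pod indexable by node name.
func bindingSequence() (added, updated *corev1.Pod) {
	added = &corev1.Pod{
		ObjectMeta: metav1.ObjectMeta{Name: "foreign-pod", Namespace: "default"},
		// Spec.NodeName is empty at creation time.
	}
	updated = added.DeepCopy()
	updated.Spec.NodeName = "worker-0" // set once the default scheduler binds the pod
	return added, updated
}
```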
1 change: 0 additions & 1 deletion pkg/noderesourcetopology/cache/indexer_test.go
@@ -162,7 +162,6 @@ func TestNodeNameIndexerByIndexAdd(t *testing.T) {
 			}
 		})
 	}
-
 }
 
 const (