From a71a35d4acaaac69db316e06172961f14e741a3a Mon Sep 17 00:00:00 2001
From: Benjamin Pineau
Date: Fri, 19 Aug 2022 09:39:15 +0200
Subject: [PATCH] [local] Backward compatible spread topology - hack

/!\ This is an unfortunately unavoidable change to vendor/, not meant to stay
indefinitely.

The implementation tries to be non-intrusive and avoids refactoring, to ease
future rebases; this change should be removed once the cluster-autoscaler no
longer needs to support Kubernetes clusters < v1.24.

The topology spread constraints' skew accounting changed slightly upstream,
which can lead the cluster-autoscaler (>= 1.24) to leave pods pending on k8s
clusters < 1.24:

When evaluating node options for a pending pod that has topology spread
constraints, Kubernetes used to (and still does) inventory all the possible
topology domains (eg. zones us-east-1a, us-east-1b, etc., which it will try to
use in a balanced way with respect to the configured skew) by listing nodes
running pods that match the provided labelSelector, and filtering out those
that don't pass the tested pod's nodeAffinities.

But when computing the number of instances per topology domain to evaluate the
skew, the Kubernetes scheduler (< 1.24) used to count pods on all nodes having
pods that match the labelSelector, irrespective of those nodes' conformance to
the tested pod's nodeAffinity.

This changed with Kubernetes commit 935cbc8e625e6f175a44e4490fecc7a25edf6d45
(refactored later on), which I believe is part of k8s v1.24: the scheduler now
also filters out nodes that don't match the tested pod's nodeAffinities when
counting pods per topology domain (ie. when computing the skew).

Since the cluster-autoscaler 1.24 uses upstream's scheduler framework, it
inherited that behaviour, which can lead to evaluations diverging from the k8s
scheduler's (when the cluster's scheduler is < 1.24): a node could be
considered schedulable by the autoscaler (so no upscale is triggered) while
the k8s scheduler would consider that it doesn't satisfy the skew constraints.

One example that can trigger that situation is a deployment changing its
affinities (eg. to move to a new set of nodes) while older pods/nodes are
already at maximum skew tolerance (eg. slightly unbalanced). For instance,
with a deployment configured like so:

```
labels:
  app: myapp
replicas: 4
topologySpreadConstraints:
- labelSelector:
    matchLabels:
      app: myapp
  maxSkew: 1
  topologyKey: zone
  whenUnsatisfiable: DoNotSchedule
nodeAffinity:
  requiredDuringSchedulingIgnoredDuringExecution:
    nodeSelectorTerms:
    - matchExpressions:
      - key: nodeset
        operator: In
        values: [foo]
```

At first the application could end up distributed over nodeset=foo nodes like
this:

```
node1 nodeset=foo zone=1   pod-1 app=myapp
node2 nodeset=foo zone=2   pod-2 app=myapp
node3 nodeset=foo zone=3   pod-3 app=myapp
node4 nodeset=foo zone=1   pod-4 app=myapp   # zone=1 again
node5 nodeset=bar zone=1   # not used yet, because it doesn't match the nodeAffinity
node6 nodeset=bar zone=2   # unschedulable (eg. full, or cordoned, ...)
node7 nodeset=bar zone=3   # unschedulable (eg. full, or cordoned, ...)
```

Then the application's affinity is updated to eg. `values: [bar]` and a new
pod is created as part of the rollout (or by an upscale). With the older
scheduler's `podtopologyspread` predicate, we'd count:

```
zone 1: 2 app=myapp pods
zone 2: 1 app=myapp pod
zone 3: 1 app=myapp pod
```

so we can't use node5 in zone 1, because we're already at the `maxSkew: 1`
budget (one excess pod) in that zone: we need a nodeset=bar upscale in zone 2
or 3.
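For illustration only, here is a minimal standalone sketch of that pre-1.24
check applied to the numbers above. It is not the scheduler's code (the
program and its variable names are made up), but the comparison it performs
(matching pods in the candidate's zone, plus the pending pod itself, minus the
global minimum across zones, checked against maxSkew) mirrors the
DoNotSchedule filtering described here:

```
// Toy example of the pre-1.24 skew check: pods on the nodeset=foo nodes are
// still counted, because the old accounting ignores the pending pod's
// nodeAffinity.
package main

import "fmt"

func main() {
	matchNumPerZone := map[string]int{"1": 2, "2": 1, "3": 1} // app=myapp pods per zone
	maxSkew := 1

	// Global minimum of matching pods across the known zones.
	minMatchNum := matchNumPerZone["1"]
	for _, n := range matchNumPerZone {
		if n < minMatchNum {
			minMatchNum = n
		}
	}

	// Candidate node5 sits in zone 1; the pending pod matches its own
	// labelSelector, hence the "+ 1".
	skew := matchNumPerZone["1"] + 1 - minMatchNum
	fmt.Printf("zone 1: skew=%d maxSkew=%d -> schedulable=%v\n",
		skew, maxSkew, skew <= maxSkew)
	// Prints: zone 1: skew=2 maxSkew=1 -> schedulable=false
	// so node5 is rejected and the pod stays pending until a nodeset=bar
	// node is added in zone 2 or 3.
}
```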
The newer scheduler, on the other hand, only counts pods running on
nodeset=bar nodes to compute the skew, which gives:

```
zone 1: 0 app=myapp pods
zone 2: 0 app=myapp pods
zone 3: 0 app=myapp pods
```

which means the new pod can use any node, including the already available
node5: no upscale needed.
---
 .../plugins/podtopologyspread/filtering.go    | 37 +++++++++++++++++++
 1 file changed, 37 insertions(+)

diff --git a/cluster-autoscaler/vendor/k8s.io/kubernetes/pkg/scheduler/framework/plugins/podtopologyspread/filtering.go b/cluster-autoscaler/vendor/k8s.io/kubernetes/pkg/scheduler/framework/plugins/podtopologyspread/filtering.go
index 2b3580596e2a..201bbc76fbf5 100644
--- a/cluster-autoscaler/vendor/k8s.io/kubernetes/pkg/scheduler/framework/plugins/podtopologyspread/filtering.go
+++ b/cluster-autoscaler/vendor/k8s.io/kubernetes/pkg/scheduler/framework/plugins/podtopologyspread/filtering.go
@@ -20,6 +20,8 @@ import (
 	"context"
 	"fmt"
 	"math"
+	"os"
+	"sync"
 
 	v1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/labels"
@@ -287,6 +289,41 @@ func (pl *PodTopologySpread) calPreFilterState(ctx context.Context, pod *v1.Pod)
 			s.TpPairToMatchNum[tp] += count
 		}
 	}
+
+	// backward compatibility with k8s < v1.24: also count (for skew evaluation) pods matching
+	// the spread label selector yet running on nodes not supported by this pod's node affinities
+	matchingPodsCountOnNonAffinityNodes := make(map[topologyPair]int, len(allNodes))
+	var matchingPodsCountOnNonAffinityNodesMu sync.Mutex
+	pl.parallelizer.Until(ctx, len(allNodes), func(i int) {
+		if len(tpCountsByNode[i]) > 0 {
+			return // already accounted (nodes matching our pods' nodeaffinity)
+		}
+		if os.Getenv("SPREADTOPOLOGY_SKEW_FILTERS_NODEAFFINITY") != "disabled" {
+			return // we should only set that to "disabled" on k8s < 1.24
+		}
+		nodeInfo := allNodes[i]
+		node := nodeInfo.Node()
+		if !nodeLabelsMatchSpreadConstraints(node.Labels, constraints) {
+			return // node not having the spreadconstraint key/label
+		}
+		for _, constraint := range constraints {
+			pair := topologyPair{key: constraint.TopologyKey, value: node.Labels[constraint.TopologyKey]}
+			if _, ok := s.TpPairToMatchNum[pair]; !ok {
+				// ignore spreadconstraint values (eg. zone names) that are not provided
+				// by any node running pods that match the spreadconstraint label selector
+				// AND that satisfy the pods' nodeaffinity (k8s < 1.24 behaviour).
+				continue
+			}
+			count := countPodsMatchSelector(nodeInfo.Pods, constraint.Selector, pod.Namespace)
+			matchingPodsCountOnNonAffinityNodesMu.Lock()
+			matchingPodsCountOnNonAffinityNodes[pair] += count
+			matchingPodsCountOnNonAffinityNodesMu.Unlock()
+		}
+	})
+	for pair, count := range matchingPodsCountOnNonAffinityNodes {
+		s.TpPairToMatchNum[pair] += count
+	}
+
 	if pl.enableMinDomainsInPodTopologySpread {
 		s.TpKeyToDomainsNum = make(map[string]int, len(constraints))
 		for tp := range s.TpPairToMatchNum {
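With this patch applied, the pre-1.24 accounting above is only activated when
the SPREADTOPOLOGY_SKEW_FILTERS_NODEAFFINITY environment variable is set to
"disabled" on the cluster-autoscaler; otherwise the upstream 1.24+ behaviour
is kept. For comparison with the earlier sketch (again an illustration, not
scheduler code), here is the same toy check under the 1.24+ counting, where
the nodeset=foo nodes no longer contribute because they fail the pending
pod's nodeAffinity:

```
// Toy example of the 1.24+ skew check: only nodes matching the pending
// pod's nodeAffinity (nodeset=bar) are counted, and none of them run
// app=myapp pods yet.
package main

import "fmt"

func main() {
	matchNumPerZone := map[string]int{"1": 0, "2": 0, "3": 0}
	maxSkew := 1
	minMatchNum := 0 // every zone is at zero

	// Candidate node5 sits in zone 1; the pending pod still counts ("+ 1").
	skew := matchNumPerZone["1"] + 1 - minMatchNum
	fmt.Printf("zone 1: skew=%d maxSkew=%d -> schedulable=%v\n",
		skew, maxSkew, skew <= maxSkew)
	// Prints: zone 1: skew=1 maxSkew=1 -> schedulable=true
	// so an unpatched autoscaler (>= 1.24) sees node5 as usable and does not
	// trigger an upscale, while a < 1.24 kube-scheduler still rejects it.
}
```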