From 4e283e34ee3e0ce1eb372cfb877c3d299675e0a8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kuba=20Tu=C5=BCnik?= Date: Thu, 12 Dec 2024 14:20:51 +0100 Subject: [PATCH] CA: don't error out in HintingSimulator if a hinted Node is gone If a hinted Node is no longer in the cluster snapshot (e.g. it was a fake upcoming Node and the real one appeared). This was introduced in the recent PredicateChecker->PredicateSnapshot refactor. Previously, PredicateChecker.CheckPredicates() would return an error if the hinted Node was gone, and HintingSimulator treated this error the same as failing predicates - it would move on to the non-hinting logic. After the refactor, HintingSimulator explicitly errors out if it can't retrieve the hinted Node from the snapshot, so the behavior changed. I checked other CheckPredicates()/SchedulePod() callsites, and this is the only one when ignoring the missing Node makes sense. For the others, the Node is added to the snapshot just before the call, so it being missing should cause an error. --- .../simulator/scheduling/hinting_simulator.go | 3 ++- .../scheduling/hinting_simulator_test.go | 27 +++++++++++++++++++ 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/cluster-autoscaler/simulator/scheduling/hinting_simulator.go b/cluster-autoscaler/simulator/scheduling/hinting_simulator.go index 8a35877a2c44..768a8e7cc8ad 100644 --- a/cluster-autoscaler/simulator/scheduling/hinting_simulator.go +++ b/cluster-autoscaler/simulator/scheduling/hinting_simulator.go @@ -91,7 +91,8 @@ func (s *HintingSimulator) tryScheduleUsingHints(clusterSnapshot clustersnapshot nodeInfo, err := clusterSnapshot.GetNodeInfo(hintedNode) if err != nil { - return "", err + // The hinted Node is no longer in the cluster. No need to error out, we can just look for another one. + return "", nil } if !isNodeAcceptable(nodeInfo) { return "", nil diff --git a/cluster-autoscaler/simulator/scheduling/hinting_simulator_test.go b/cluster-autoscaler/simulator/scheduling/hinting_simulator_test.go index d4b86d256862..d317916519b5 100644 --- a/cluster-autoscaler/simulator/scheduling/hinting_simulator_test.go +++ b/cluster-autoscaler/simulator/scheduling/hinting_simulator_test.go @@ -35,6 +35,7 @@ func TestTrySchedulePods(t *testing.T) { nodes []*apiv1.Node pods []*apiv1.Pod newPods []*apiv1.Pod + hints map[*apiv1.Pod]string acceptableNodes func(*framework.NodeInfo) bool wantStatuses []Status wantErr bool @@ -58,6 +59,27 @@ func TestTrySchedulePods(t *testing.T) { {Pod: BuildTestPod("p3", 500, 500000), NodeName: "n1"}, }, }, + + { + desc: "hinted Node no longer in the cluster doesn't cause an error", + nodes: []*apiv1.Node{ + buildReadyNode("n1", 1000, 2000000), + buildReadyNode("n2", 1000, 2000000), + }, + pods: []*apiv1.Pod{ + buildScheduledPod("p1", 300, 500000, "n1"), + }, + newPods: []*apiv1.Pod{ + BuildTestPod("p2", 800, 500000), + BuildTestPod("p3", 500, 500000), + }, + hints: map[*apiv1.Pod]string{BuildTestPod("p2", 800, 500000): "non-existing-node"}, + acceptableNodes: ScheduleAnywhere, + wantStatuses: []Status{ + {Pod: BuildTestPod("p2", 800, 500000), NodeName: "n2"}, + {Pod: BuildTestPod("p3", 500, 500000), NodeName: "n1"}, + }, + }, { desc: "three new pods, two nodes, no fit", nodes: []*apiv1.Node{ @@ -136,6 +158,11 @@ func TestTrySchedulePods(t *testing.T) { clusterSnapshot := testsnapshot.NewTestSnapshotOrDie(t) clustersnapshot.InitializeClusterSnapshotOrDie(t, clusterSnapshot, tc.nodes, tc.pods) s := NewHintingSimulator() + + for pod, nodeName := range tc.hints { + s.hints.Set(HintKeyFromPod(pod), nodeName) + } + statuses, _, err := s.TrySchedulePods(clusterSnapshot, tc.newPods, tc.acceptableNodes, false) if tc.wantErr { assert.Error(t, err)