fix: fix possible panic when 'SetNode' is called #1685

Merged 1 commit on Oct 21, 2021
33 changes: 25 additions & 8 deletions pkg/scheduler/api/node_info.go
@@ -303,17 +303,34 @@ func (ni *NodeInfo) setNodeGPUInfo(node *v1.Node) {

 // SetNode sets kubernetes node object to nodeInfo object
 func (ni *NodeInfo) SetNode(node *v1.Node) {
-	ni.setOversubscription(node)
 	ni.setNodeState(node)
-	ni.setNodeGPUInfo(node)
-	ni.setRevocableZone(node)

 	if !ni.Ready() {
-		klog.Warningf("Failed to set node info, phase: %s, reason: %s",
-			ni.State.Phase, ni.State.Reason)
+		klog.Warningf("Failed to set node info for %s, phase: %s, reason: %s",
+			ni.Name, ni.State.Phase, ni.State.Reason)
 		return
 	}

+	// Dry run, make sure all fields other than `State` are in the original state.
+	copy := ni.Clone()
Collaborator:

why clone?

@eggiter (Member, Author) commented on Aug 30, 2021:

To make sure all fields are left untouched and in the original state, as I commented.

Collaborator:

Can you point to where we mutate this node?

@eggiter (Member, Author) commented on Aug 31, 2021:

  • The original code changes the fields of nodeInfo directly.
  • The clone here is merely a dry run: if applying all these changes would make the node not ready, we skip the actual update and only set the node's state to NotReady.
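
For context, here is a minimal, self-contained sketch of the failure mode the dry run guards against. It is an illustration rather than code from this PR, and it assumes the volcano.sh/volcano module path and made-up resource numbers: when the updated node's allocatable drops below what running tasks already consume, subtracting the task requests from Idle trips the assertion in Resource.Sub and, if assertions are configured to panic, crashes the scheduler.

package main

import api "volcano.sh/volcano/pkg/scheduler/api"

func main() {
	// Hypothetical numbers: the node shrank from 10 CPUs to 8 while its
	// running tasks still request 9 CPUs in total.
	idle := &api.Resource{MilliCPU: 8000, Memory: 8e9}
	used := &api.Resource{MilliCPU: 9000, Memory: 9e9}

	// Sub asserts that `used` fits into `idle`; here it does not, so the
	// assertion fails with "resource is not sufficient to do operation".
	idle.Sub(used)
}

The dry run on a clone lets SetNode catch this case and only flip the node's State to NotReady, instead of letting the assertion fire while the live NodeInfo is half-updated.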

+	copy.setNode(node)
+	copy.setNodeState(node)
+	if !copy.Ready() {
+		klog.Warningf("SetNode makes node %s not ready, phase: %s, reason: %s",
+			copy.Name, copy.State.Phase, copy.State.Reason)
+		// Set state of node to !Ready, left other fields untouched
+		ni.State = copy.State
+		return
+	}
+
+	ni.setNode(node)
+}
+
+// setNode sets kubernetes node object to nodeInfo object without assertion
+func (ni *NodeInfo) setNode(node *v1.Node) {
+	ni.setOversubscription(node)
+	ni.setNodeGPUInfo(node)
+	ni.setRevocableZone(node)
+
 	ni.Name = node.Name
 	ni.Node = node

@@ -327,14 +344,14 @@ func (ni *NodeInfo) SetNode(node *v1.Node) {
 	for _, ti := range ni.Tasks {
 		switch ti.Status {
 		case Releasing:
-			ni.Idle.Sub(ti.Resreq)
+			ni.Idle.sub(ti.Resreq) // sub without assertion
 			ni.Releasing.Add(ti.Resreq)
 			ni.Used.Add(ti.Resreq)
 			ni.AddGPUResource(ti.Pod)
 		case Pipelined:
 			ni.Pipelined.Add(ti.Resreq)
 		default:
-			ni.Idle.Sub(ti.Resreq)
+			ni.Idle.sub(ti.Resreq) // sub without assertion
 			ni.Used.Add(ti.Resreq)
 			ni.AddGPUResource(ti.Pod)
 		}
66 changes: 66 additions & 0 deletions pkg/scheduler/api/node_info_test.go
@@ -167,3 +167,69 @@ func TestNodeInfo_RemovePod(t *testing.T) {
 		}
 	}
 }
+
+func TestNodeInfo_SetNode(t *testing.T) {
+	// case1
+	case01Node1 := buildNode("n1", buildResourceList("10", "10G"))
+	case01Node2 := buildNode("n1", buildResourceList("8", "8G"))
+	case01Pod1 := buildPod("c1", "p1", "n1", v1.PodRunning, buildResourceList("1", "1G"), []metav1.OwnerReference{}, make(map[string]string))
+	case01Pod2 := buildPod("c1", "p2", "n1", v1.PodRunning, buildResourceList("2", "2G"), []metav1.OwnerReference{}, make(map[string]string))
+	case01Pod3 := buildPod("c1", "p3", "n1", v1.PodRunning, buildResourceList("6", "6G"), []metav1.OwnerReference{}, make(map[string]string))
+
+	tests := []struct {
+		name     string
+		node     *v1.Node
+		updated  *v1.Node
+		pods     []*v1.Pod
+		expected *NodeInfo
+	}{
+		{
+			name:    "add 3 running non-owner pod",
+			node:    case01Node1,
+			updated: case01Node2,
+			pods:    []*v1.Pod{case01Pod1, case01Pod2, case01Pod3},
+			expected: &NodeInfo{
+				Name:                     "n1",
+				Node:                     case01Node1,
+				Idle:                     buildResource("1", "1G"),
+				Used:                     buildResource("9", "9G"),
+				OversubscriptionResource: EmptyResource(),
+				Releasing:                EmptyResource(),
+				Pipelined:                EmptyResource(),
+				Allocatable:              buildResource("10", "10G"),
+				Capability:               buildResource("10", "10G"),
+				State:                    NodeState{Phase: NotReady, Reason: "OutOfSync"},
+				Tasks: map[TaskID]*TaskInfo{
+					"c1/p1": NewTaskInfo(case01Pod1),
+					"c1/p2": NewTaskInfo(case01Pod2),
+					"c1/p3": NewTaskInfo(case01Pod3),
+				},
+				GPUDevices: make(map[int]*GPUDevice),
+			},
+		},
+	}
+
+	for i, test := range tests {
+		ni := NewNodeInfo(test.node)
+		for _, pod := range test.pods {
+			pi := NewTaskInfo(pod)
+			ni.AddTask(pi)
+			ni.Name = pod.Spec.NodeName
+		}
+
+		// OutOfSync. e.g.: nvidia-device-plugin is down causes gpus turn from 8 to 0 (node.status.allocatable."nvidia.com/gpu": 0)
+		ni.SetNode(test.updated)
+		if !nodeInfoEqual(ni, test.expected) {
+			t.Errorf("node info %d: \n expected\t%v, \n got\t\t%v \n",
+				i, test.expected, ni)
+		}
+
+		// Recover. e.g.: nvidia-device-plugin is restarted successfully
+		ni.SetNode(test.node)
+		test.expected.State = NodeState{Phase: Ready}
+		if !nodeInfoEqual(ni, test.expected) {
+			t.Errorf("recovered %d: \n expected\t%v, \n got\t\t%v \n",
+				i, test.expected, ni)
+		}
+	}
+}
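
To run only this new case locally, the standard Go tooling should suffice from the repository root (a hypothetical invocation, assuming a regular checkout of the project):

	go test -v -run TestNodeInfo_SetNode ./pkg/scheduler/api/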
6 changes: 5 additions & 1 deletion pkg/scheduler/api/resource_info.go
@@ -197,10 +197,14 @@ func (r *Resource) Add(rr *Resource) *Resource {
 	return r
 }

-//Sub subtracts two Resource objects.
+// Sub subtracts two Resource objects with assertion.
Contributor:

Although this modification is clear, I think the original function is also acceptable.

 func (r *Resource) Sub(rr *Resource) *Resource {
 	assert.Assertf(rr.LessEqual(r, Zero), "resource is not sufficient to do operation: <%v> sub <%v>", r, rr)
+	return r.sub(rr)
+}
+
+// sub subtracts two Resource objects.
+func (r *Resource) sub(rr *Resource) *Resource {
 	r.MilliCPU -= rr.MilliCPU
 	r.Memory -= rr.Memory
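
The Sub/sub split is an instance of a common pattern: keep the exported, asserting entry point for normal callers, and add an unexported, unchecked variant for paths such as the dry run above, where a temporarily over-committed node is expected and must not panic. Below is a small, self-contained sketch of that pattern; it uses simplified stand-in types and made-up numbers, not volcano's actual Resource implementation.

package main

import "fmt"

// demoResource is a simplified stand-in for the scheduler's Resource type.
type demoResource struct {
	MilliCPU float64
	Memory   float64
}

// lessEqual reports whether r fits entirely within rr.
func (r *demoResource) lessEqual(rr *demoResource) bool {
	return r.MilliCPU <= rr.MilliCPU && r.Memory <= rr.Memory
}

// Sub is the checked entry point: it panics when rr does not fit in r.
func (r *demoResource) Sub(rr *demoResource) *demoResource {
	if !rr.lessEqual(r) {
		panic(fmt.Sprintf("resource is not sufficient to do operation: <%+v> sub <%+v>", *r, *rr))
	}
	return r.sub(rr)
}

// sub is the unchecked core: it only does the arithmetic and allows negative results.
func (r *demoResource) sub(rr *demoResource) *demoResource {
	r.MilliCPU -= rr.MilliCPU
	r.Memory -= rr.Memory
	return r
}

func main() {
	idle := &demoResource{MilliCPU: 8000, Memory: 8e9} // node shrank to 8 CPUs
	req := &demoResource{MilliCPU: 9000, Memory: 9e9}  // tasks still request 9 CPUs

	// Unchecked path, as used during the dry run: the result simply goes negative.
	fmt.Printf("after sub: %+v\n", *idle.sub(req))

	// idle.Sub(req) // the checked path would panic on the same input
}

In the scheduler itself only the package-internal accounting in setNode uses the unchecked sub, so external callers of Sub still get the assertion.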