Skip to content

Commit

Permalink
nfd-master: re-try on node update failures
Browse files Browse the repository at this point in the history
Change the NFD API handler to retry on node update failures. This works
around transient failures, ensuring that failed nodes (i.e. nodes
that we failed to update) don't need to wait for the 1-hour resync
period before being tried again.
  • Loading branch information
marquiz committed Apr 13, 2023
1 parent e75be0b commit 6b2d107
Showing 1 changed file with 7 additions and 3 deletions.
10 changes: 7 additions & 3 deletions pkg/nfd-master/nfd-master.go
Original file line number Diff line number Diff line change
Expand Up @@ -329,21 +329,25 @@ func (m *nfdMaster) nfdAPIUpdateHandler() {
case <-rateLimit:
// Check what we need to do
// TODO: we might want to update multiple nodes in parallel
errUpdateAll := false
errNodes := make(map[string]struct{})
if updateAll {
if err := m.nfdAPIUpdateAllNodes(); err != nil {
klog.Error(err)
errUpdateAll = true
}
} else {
for nodeName := range updateNodes {
if err := m.nfdAPIUpdateOneNode(nodeName); err != nil {
klog.Error(err)
errNodes[nodeName] = struct{}{}
}
}
}

// Reset "work queue" and timer
updateAll = false
updateNodes = make(map[string]struct{})
// Reset "work queue" and timer, will cause re-try if errors happened
updateAll = errUpdateAll
updateNodes = errNodes
rateLimit = time.After(time.Second)
}
}
Expand Down

0 comments on commit 6b2d107

Please sign in to comment.