
Commit 846a2e5

Delayed evaluations for stop_after_client_disconnect can cause unwanted extra followup evaluations around job garbage collection (#8099)

* client/heartbeatstop: reversed time condition for startup grace

* scheduler/generic_sched: use `delayInstead` to avoid a loop

Without protecting the loop that creates followUpEvals, a delayed eval
is allowed to create an immediate subsequent delayed eval. For both
`stop_after_client_disconnect` and the `reschedule` block, a delayed
eval should always produce some immediate result (running or blocked)
first, and only produce a second delayed eval once the outcome of that
result is known (see the sketch after these notes).

* scheduler/reconcile: lostLater are different than delayedReschedules

Just slightly. `lostLater` allocs should be used to create batched
evaluations, but `handleDelayedReschedules` assumes that the
allocations are in the untainted set. When it creates the in-place
updates to those allocations at the end, the planner treats the
allocation as still running, which causes the initial
`stop_after_client_disconnect` evaluation to be retried by the worker
(a sketch of this split follows the scheduler/reconcile.go diff below).
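
To make the guard concrete, here is a minimal, self-contained sketch of the `delayInstead` pattern (simplified, assumed types, not the actual scheduler code): delayed followup evals are only emitted when the triggering eval was not itself delayed, so a delayed eval always resolves to an immediate outcome first.

```go
package main

import (
	"fmt"
	"time"
)

// Evaluation is a pared-down stand-in for structs.Evaluation, keeping only
// the fields this sketch needs.
type Evaluation struct {
	ID           string
	WaitUntil    time.Time // non-zero means this eval was delayed
	PreviousEval string
}

// createFollowups mirrors the shape of the fix: delayed followup evals are
// only handed to the planner when the current eval was *not* itself delayed.
func createFollowups(current *Evaluation, followUpEvals []*Evaluation) []*Evaluation {
	delayInstead := len(followUpEvals) > 0 && current.WaitUntil.IsZero()
	if !delayInstead {
		// A delayed eval lands here: it must produce an immediate outcome
		// (placements or a blocked eval) rather than another delayed eval.
		return nil
	}
	for _, e := range followUpEvals {
		e.PreviousEval = current.ID // link the followup back to its creator
	}
	return followUpEvals
}

func main() {
	immediate := &Evaluation{ID: "eval-1"}
	delayed := &Evaluation{ID: "eval-2", WaitUntil: time.Now().Add(time.Minute)}
	proposed := []*Evaluation{{ID: "eval-3"}}

	fmt.Println(len(createFollowups(immediate, proposed))) // 1: an immediate eval may schedule delayed work
	fmt.Println(len(createFollowups(delayed, proposed)))   // 0: a delayed eval never chains into another delay
}
```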
langmartin committed Jun 3, 2020
1 parent c7bf94e commit 846a2e5
Showing 6 changed files with 36 additions and 23 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -15,6 +15,7 @@ BUG FIXES:
* core: Fixed a bug where new leader may take a long time until it can process requests [[GH-8036](https://github.com/hashicorp/nomad/issues/8036)]
* core: Fixed a bug where an internal metadata, ClusterID, may not be initialized properly upon a slow server upgrade [[GH-8078](https://github.com/hashicorp/nomad/issues/8078)].
* api: Fixed a bug where setting connect sidecar task resources could fail [[GH-7993](https://github.com/hashicorp/nomad/issues/7993)]
+ * core: Fixed a bug where stop_after_client_disconnect could cause the server to become unresponsive [[GH-8098](https://github.com/hashicorp/nomad/issues/8098)]
* client: Fixed a bug where artifact downloads failed on redirects [[GH-7854](https://github.com/hashicorp/nomad/issues/7854)]
* csi: Validate empty volume arguments in API. [[GH-8027](https://github.com/hashicorp/nomad/issues/8027)]

2 changes: 1 addition & 1 deletion client/heartbeatstop.go
@@ -60,7 +60,7 @@ func (h *heartbeatStop) shouldStop(alloc *structs.Allocation) bool {
func (h *heartbeatStop) shouldStopAfter(now time.Time, interval time.Duration) bool {
lastOk := h.getLastOk()
if lastOk.IsZero() {
- return h.startupGrace.After(now)
+ return now.After(h.startupGrace)
}
return now.After(lastOk.Add(interval))
}
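
For illustration, a standalone restatement of the corrected condition (an assumed free function, not the client's method): with no heartbeat recorded yet, allocations are stopped only once the startup grace deadline has passed, rather than the other way around.

```go
package main

import (
	"fmt"
	"time"
)

// shouldStopAfter restates the fixed logic as a free function for
// illustration: when no heartbeat has ever been recorded (lastOk is zero),
// stop only after the startup grace deadline has passed.
func shouldStopAfter(now, lastOk, startupGrace time.Time, interval time.Duration) bool {
	if lastOk.IsZero() {
		return now.After(startupGrace) // fixed; previously startupGrace.After(now)
	}
	return now.After(lastOk.Add(interval))
}

func main() {
	var lastOk time.Time // zero value: no heartbeat seen yet
	start := time.Now()
	grace := start.Add(30 * time.Second)

	fmt.Println(shouldStopAfter(start, lastOk, grace, time.Minute))                  // false: still inside the grace period
	fmt.Println(shouldStopAfter(start.Add(time.Minute), lastOk, grace, time.Minute)) // true: grace elapsed with no heartbeat
}
```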
2 changes: 1 addition & 1 deletion nomad/plan_apply.go
@@ -474,7 +474,7 @@ func evaluatePlanPlacements(pool *EvaluatePool, snap *state.StateSnapshot, plan
if !fit {
// Log the reason why the node's allocations could not be made
if reason != "" {
logger.Debug("plan for node rejected", "node_id", nodeID, "reason", reason)
logger.Debug("plan for node rejected", "node_id", nodeID, "reason", reason, "eval_id", plan.EvalID)
}
// Set that this is a partial commit
partialCommit = true
17 changes: 11 additions & 6 deletions scheduler/generic_sched.go
@@ -87,6 +87,8 @@ type GenericScheduler struct {
ctx *EvalContext
stack *GenericStack

+ // followUpEvals are evals with WaitUntil set, which are delayed until that time
+ // before being rescheduled
followUpEvals []*structs.Evaluation

deployment *structs.Deployment
@@ -258,11 +260,13 @@ func (s *GenericScheduler) process() (bool, error) {

// If there are failed allocations, we need to create a blocked evaluation
// to place the failed allocations when resources become available. If the
- // current evaluation is already a blocked eval, we reuse it by submitting
- // a new eval to the planner in createBlockedEval. If the current eval is
- // pending with WaitUntil set, it's delayed rather than blocked.
+ // current evaluation is already a blocked eval, we reuse it. If not, submit
+ // a new eval to the planner in createBlockedEval. If rescheduling should
+ // be delayed, do that instead.
+ delayInstead := len(s.followUpEvals) > 0 && s.eval.WaitUntil.IsZero()

if s.eval.Status != structs.EvalStatusBlocked && len(s.failedTGAllocs) != 0 && s.blocked == nil &&
- s.eval.WaitUntil.IsZero() {
+ !delayInstead {
if err := s.createBlockedEval(false); err != nil {
s.logger.Error("failed to make blocked eval", "error", err)
return false, err
@@ -276,8 +280,9 @@ func (s *GenericScheduler) process() (bool, error) {
return true, nil
}

- // Create follow up evals for any delayed reschedule eligible allocations
- if len(s.followUpEvals) > 0 {
+ // Create follow up evals for any delayed reschedule eligible allocations, except in
+ // the case that this evaluation was already delayed.
+ if delayInstead {
for _, eval := range s.followUpEvals {
eval.PreviousEval = s.eval.ID
// TODO(preetha) this should be batching evals before inserting them
4 changes: 2 additions & 2 deletions scheduler/generic_sched_test.go
@@ -2846,8 +2846,8 @@ func TestServiceSched_StopAfterClientDisconnect(t *testing.T) {
require.Equal(t, h.Evals[0].Status, structs.EvalStatusComplete)
require.Len(t, h.Plans, 1, "plan")

- // Followup eval created
- require.True(t, len(h.CreateEvals) > 0)
+ // One followup eval created, either delayed or blocked
+ require.Len(t, h.CreateEvals, 1)
e := h.CreateEvals[0]
require.Equal(t, eval.ID, e.PreviousEval)

33 changes: 20 additions & 13 deletions scheduler/reconcile.go
@@ -355,18 +355,12 @@ func (a *allocReconciler) computeGroup(group string, all allocSet) bool {

// Find delays for any lost allocs that have stop_after_client_disconnect
lostLater := lost.delayByStopAfterClientDisconnect()
- rescheduleLater = append(rescheduleLater, lostLater...)
+ a.handleDelayedLost(lostLater, all, tg.Name)

// Create batched follow up evaluations for allocations that are
// reschedulable later and mark the allocations for in place updating
a.handleDelayedReschedules(rescheduleLater, all, tg.Name)

- // Allocs that are lost and delayed have an attributeUpdate that correctly links to
- // the eval, but incorrectly has the current (running) status
- for _, d := range lostLater {
- a.result.attributeUpdates[d.allocID].SetStop(structs.AllocClientStatusLost, structs.AllocClientStatusLost)
- }

// Create a structure for choosing names. Seed with the taken names which is
// the union of untainted and migrating nodes (includes canaries)
nameIndex := newAllocNameIndex(a.jobID, group, tg.Count, untainted.union(migrate, rescheduleNow))
@@ -423,7 +417,7 @@ func (a *allocReconciler) computeGroup(group string, all allocSet) bool {
// * The deployment is not paused or failed
// * Not placing any canaries
// * If there are any canaries that they have been promoted
- // * There is no delayed stop_after_client_disconnect alloc
+ // * There is no delayed stop_after_client_disconnect alloc, which delays scheduling for the whole group
var place []allocPlaceResult
if len(lostLater) == 0 {
place = a.computePlacements(tg, nameIndex, untainted, migrate, rescheduleNow)
@@ -845,6 +839,17 @@ func (a *allocReconciler) computeUpdates(group *structs.TaskGroup, untainted all
// handleDelayedReschedules creates batched followup evaluations with the WaitUntil field set
// for allocations that are eligible to be rescheduled later
func (a *allocReconciler) handleDelayedReschedules(rescheduleLater []*delayedRescheduleInfo, all allocSet, tgName string) {
+ a.handleDelayedReschedulesImpl(rescheduleLater, all, tgName, true)
+ }
+
+ // handleDelayedLost creates batched followup evaluations with the WaitUntil field set for lost allocations
+ func (a *allocReconciler) handleDelayedLost(rescheduleLater []*delayedRescheduleInfo, all allocSet, tgName string) {
+ a.handleDelayedReschedulesImpl(rescheduleLater, all, tgName, false)
+ }
+
+ // handleDelayedReschedulesImpl creates batched followup evaluations with the WaitUntil field set
+ func (a *allocReconciler) handleDelayedReschedulesImpl(rescheduleLater []*delayedRescheduleInfo, all allocSet, tgName string,
+ createUpdates bool) {
if len(rescheduleLater) == 0 {
return
}
@@ -905,10 +910,12 @@ func (a *allocReconciler) handleDelayedReschedules(rescheduleLater []*delayedRes
}

// Create in-place updates for every alloc ID that needs to be updated with its follow up eval ID
- for allocID, evalID := range allocIDToFollowupEvalID {
- existingAlloc := all[allocID]
- updatedAlloc := existingAlloc.Copy()
- updatedAlloc.FollowupEvalID = evalID
- a.result.attributeUpdates[updatedAlloc.ID] = updatedAlloc
+ if createUpdates {
+ for allocID, evalID := range allocIDToFollowupEvalID {
+ existingAlloc := all[allocID]
+ updatedAlloc := existingAlloc.Copy()
+ updatedAlloc.FollowupEvalID = evalID
+ a.result.attributeUpdates[updatedAlloc.ID] = updatedAlloc
+ }
}
}
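
As an aside on the `createUpdates` split above, a minimal standalone sketch (assumed simplified types, not the reconciler's) of why the two wrappers differ: delayed reschedules record an in-place update linking the alloc to its followup eval, while delayed lost allocs skip that step so the planner does not treat them as still running.

```go
package main

import "fmt"

// alloc is a minimal stand-in for a Nomad allocation in this sketch.
type alloc struct {
	ID             string
	FollowupEvalID string
}

// reconcilerResult keeps only the field that matters here: in-place updates
// that the planner will treat as live allocations.
type reconcilerResult struct {
	attributeUpdates map[string]*alloc
}

// handleDelayed mirrors the createUpdates split: delayed reschedules link the
// untainted alloc to its followup eval via an in-place update, while delayed
// lost allocs must not, or the planner would see them as still running.
func handleDelayed(result *reconcilerResult, all map[string]*alloc, followups map[string]string, createUpdates bool) {
	if !createUpdates {
		return
	}
	for allocID, evalID := range followups {
		updated := *all[allocID] // copy, then annotate with the followup eval
		updated.FollowupEvalID = evalID
		result.attributeUpdates[updated.ID] = &updated
	}
}

func main() {
	all := map[string]*alloc{"a1": {ID: "a1"}}
	followups := map[string]string{"a1": "eval-9"}

	res := &reconcilerResult{attributeUpdates: map[string]*alloc{}}
	handleDelayed(res, all, followups, true)  // delayed reschedule: in-place update recorded
	fmt.Println(len(res.attributeUpdates))    // 1

	res = &reconcilerResult{attributeUpdates: map[string]*alloc{}}
	handleDelayed(res, all, followups, false) // lost alloc: no update, planner won't see it as running
	fmt.Println(len(res.attributeUpdates))    // 0
}
```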
