hashicorp · lgfa29 · Mar 24, 2023 · Mar 21, 2023 · Mar 22, 2023 · Mar 22, 2023
@@ -0,0 +1,3 @@
+```release-note:bug
+scheduler: Fix reconciliation of reconnecting allocs when the replacement allocations are not running
+```
@@ -428,6 +428,34 @@ func (a *allocReconciler) computeGroup(groupName string, all allocSet) bool {
 	untainted, migrate, lost, disconnecting, reconnecting, ignore := all.filterByTainted(a.taintedNodes, a.supportsDisconnectedClients, a.now)
 	desiredChanges.Ignore += uint64(len(ignore))
 
+	// If there are allocations reconnecting we need to reconcile them and
+	// their replacements first because there is specific logic when deciding
+	// which ones to keep that can only be applied when the client reconnects.
+	if len(reconnecting) > 0 {
+		// Pass all allocations becasue the replacements we need to find may be
+		// in any state, including themselves being reconnected.
+		reconnect, stop := a.reconcileReconnecting(reconnecting, all)
+
+		// Stop the reconciled allocations and remove them from the other sets
+		// since they have been already handled.
+		desiredChanges.Stop += uint64(len(stop))
+
+		untainted = untainted.difference(stop)
+		migrate = migrate.difference(stop)
+		lost = lost.difference(stop)
+		disconnecting = disconnecting.difference(stop)
+		reconnecting = reconnecting.difference(stop)
+		ignore = ignore.difference(stop)
+
+		// Validate and add reconnecting allocations to the plan so they are
+		// logged.
+		a.computeReconnecting(reconnect)
+
+		// The rest of the reconnecting allocations is now untainted and will
+		// be further reconciled below.
+		untainted = untainted.union(reconnect)
+	}
+
 	// Determine what set of terminal allocations need to be rescheduled
 	untainted, rescheduleNow, rescheduleLater := untainted.filterByRescheduleable(a.batch, false, a.now, a.evalID, a.deployment)
 
@@ -459,14 +487,10 @@ func (a *allocReconciler) computeGroup(groupName string, all allocSet) bool {
 	// Stop any unneeded allocations and update the untainted set to not
 	// include stopped allocations.
 	isCanarying := dstate != nil && dstate.DesiredCanaries != 0 && !dstate.Promoted
-	stop, reconnecting := a.computeStop(tg, nameIndex, untainted, migrate, lost, canaries, reconnecting, isCanarying, lostLaterEvals)
+	stop := a.computeStop(tg, nameIndex, untainted, migrate, lost, canaries, isCanarying, lostLaterEvals)
 	desiredChanges.Stop += uint64(len(stop))
 	untainted = untainted.difference(stop)
 
-	// Validate and add reconnecting allocs to the plan so that they will be logged.
-	a.computeReconnecting(reconnecting)
-	desiredChanges.Ignore += uint64(len(a.result.reconnectUpdates))
-
 	// Do inplace upgrades where possible and capture the set of upgrades that
 	// need to be done destructively.
 	ignore, inplace, destructive := a.computeUpdates(tg, untainted)
@@ -496,10 +520,9 @@ func (a *allocReconciler) computeGroup(groupName string, all allocSet) bool {
 	// * If there are any canaries that they have been promoted
 	// * There is no delayed stop_after_client_disconnect alloc, which delays scheduling for the whole group
 	// * An alloc was lost
-	// * There is not a corresponding reconnecting alloc.
 	var place []allocPlaceResult
 	if len(lostLater) == 0 {
-		place = a.computePlacements(tg, nameIndex, untainted, migrate, rescheduleNow, lost, reconnecting, isCanarying)
+		place = a.computePlacements(tg, nameIndex, untainted, migrate, rescheduleNow, lost, isCanarying)
 		if !existingDeployment {
 			dstate.DesiredTotal += len(place)
 		}
@@ -705,7 +728,7 @@ func (a *allocReconciler) computeUnderProvisionedBy(group *structs.TaskGroup, un
 //
 // Placements will meet or exceed group count.
 func (a *allocReconciler) computePlacements(group *structs.TaskGroup,
-	nameIndex *allocNameIndex, untainted, migrate, reschedule, lost, reconnecting allocSet,
+	nameIndex *allocNameIndex, untainted, migrate, reschedule, lost allocSet,
 	isCanarying bool) []allocPlaceResult {
 
 	// Add rescheduled placement results
@@ -725,7 +748,7 @@ func (a *allocReconciler) computePlacements(group *structs.TaskGroup,
 	}
 
 	// Add replacements for disconnected and lost allocs up to group.Count
-	existing := len(untainted) + len(migrate) + len(reschedule) + len(reconnecting) - len(reconnecting.filterByFailedReconnect())
+	existing := len(untainted) + len(migrate) + len(reschedule)
 
 	// Add replacements for lost
 	for _, alloc := range lost {
@@ -935,28 +958,22 @@ func (a *allocReconciler) isDeploymentComplete(groupName string, destructive, in
 // the group definition, the set of allocations in various states and whether we
 // are canarying.
 func (a *allocReconciler) computeStop(group *structs.TaskGroup, nameIndex *allocNameIndex,
-	untainted, migrate, lost, canaries, reconnecting allocSet, isCanarying bool, followupEvals map[string]string) (allocSet, allocSet) {
+	untainted, migrate, lost, canaries allocSet, isCanarying bool, followupEvals map[string]string) allocSet {
 
 	// Mark all lost allocations for stop.
 	var stop allocSet
 	stop = stop.union(lost)
 	a.markDelayed(lost, structs.AllocClientStatusLost, allocLost, followupEvals)
 
-	// Mark all failed reconnects for stop.
-	failedReconnects := reconnecting.filterByFailedReconnect()
-	stop = stop.union(failedReconnects)
-	a.markStop(failedReconnects, structs.AllocClientStatusFailed, allocRescheduled)
-	reconnecting = reconnecting.difference(failedReconnects)
-
 	// If we are still deploying or creating canaries, don't stop them
 	if isCanarying {
 		untainted = untainted.difference(canaries)
 	}
 
 	// Hot path the nothing to do case
-	remove := len(untainted) + len(migrate) + len(reconnecting) - group.Count
+	remove := len(untainted) + len(migrate) - group.Count
 	if remove <= 0 {
-		return stop, reconnecting
+		return stop
 	}
 
 	// Filter out any terminal allocations from the untainted set
@@ -978,7 +995,7 @@ func (a *allocReconciler) computeStop(group *structs.TaskGroup, nameIndex *alloc
 
 				remove--
 				if remove == 0 {
-					return stop, reconnecting
+					return stop
 				}
 			}
 		}
@@ -1002,19 +1019,11 @@ func (a *allocReconciler) computeStop(group *structs.TaskGroup, nameIndex *alloc
 
 			remove--
 			if remove == 0 {
-				return stop, reconnecting
+				return stop
 			}
 		}
 	}
 
-	// Handle allocs that might be able to reconnect.
-	if len(reconnecting) != 0 {
-		remove = a.computeStopByReconnecting(untainted, reconnecting, stop, remove)
-		if remove == 0 {
-			return stop, reconnecting
-		}
-	}
-
 	// Select the allocs with the highest count to remove
 	removeNames := nameIndex.Highest(uint(remove))
 	for id, alloc := range untainted {
@@ -1028,7 +1037,7 @@ func (a *allocReconciler) computeStop(group *structs.TaskGroup, nameIndex *alloc
 
 			remove--
 			if remove == 0 {
-				return stop, reconnecting
+				return stop
 			}
 		}
 	}
@@ -1045,95 +1054,152 @@ func (a *allocReconciler) computeStop(group *structs.TaskGroup, nameIndex *alloc
 
 		remove--
 		if remove == 0 {
-			return stop, reconnecting
+			return stop
 		}
 	}
 
-	return stop, reconnecting
+	return stop
 }
 
-// computeStopByReconnecting moves allocations from either the untainted or reconnecting
-// sets to the stop set and returns the number of allocations that still need to be removed.
-func (a *allocReconciler) computeStopByReconnecting(untainted, reconnecting, stop allocSet, remove int) int {
-	if remove == 0 {
-		return remove
-	}
+// reconcileReconnecting receives the set of allocations that are reconnecting
+// and all other allocations for the same group and determines which ones to
+// reconnect which ones or stop.
+//
+//   - Every reconnecting allocation MUST be present in one, and only one, of
+//     the returned set.
+//   - Every replacement allocation that is not preferred MUST be returned in
+//     the stop set.
+//   - Only reconnecting allocations are allowed to be present in the returned
+//     reconnect set.
+//   - If the reconnecting allocation is to be stopped, its replacements may
+//     not be present in any of the returned sets. The rest of the reconciler
+//     logic will handle them.
+func (a *allocReconciler) reconcileReconnecting(reconnecting allocSet, others allocSet) (allocSet, allocSet) {
+	stop := make(allocSet)
+	reconnect := make(allocSet)
+
+	// Mark all failed reconnects for stop.
+	failedReconnects := reconnecting.filterByFailedReconnect()
+	stop = stop.union(failedReconnects)
+	a.markStop(failedReconnects, structs.AllocClientStatusFailed, allocRescheduled)
+	successfulReconnects := reconnecting.difference(failedReconnects)
 
-	for _, reconnectingAlloc := range reconnecting {
-		// if the desired status is not run, or if the user-specified desired
+	for _, reconnectingAlloc := range successfulReconnects {
+		// If the desired status is not run, or if the user-specified desired
 		// transition is not run, stop the reconnecting allocation.
-		if reconnectingAlloc.DesiredStatus != structs.AllocDesiredStatusRun ||
+		stopReconnecting := reconnectingAlloc.DesiredStatus != structs.AllocDesiredStatusRun ||
 			reconnectingAlloc.DesiredTransition.ShouldMigrate() ||
 			reconnectingAlloc.DesiredTransition.ShouldReschedule() ||
 			reconnectingAlloc.DesiredTransition.ShouldForceReschedule() ||
 			reconnectingAlloc.Job.Version < a.job.Version ||
-			reconnectingAlloc.Job.CreateIndex < a.job.CreateIndex {
+			reconnectingAlloc.Job.CreateIndex < a.job.CreateIndex
 
+		if stopReconnecting {
 			stop[reconnectingAlloc.ID] = reconnectingAlloc
 			a.result.stop = append(a.result.stop, allocStopResult{
 				alloc:             reconnectingAlloc,
 				statusDescription: allocNotNeeded,
 			})
-			delete(reconnecting, reconnectingAlloc.ID)
-
-			remove--
-			// if we've removed all we need to, stop iterating and return.
-			if remove == 0 {
-				return remove
-			}
 			continue
 		}
 
-		// Compare reconnecting to untainted and decide which to keep.
-		for _, untaintedAlloc := range untainted {
-			// If not a match by name and previous alloc continue
-			if reconnectingAlloc.Name != untaintedAlloc.Name {
+		// By default, prefer stopping the replacement allocation, so assume
+		// all non-terminal reconnecting allocations will reconnect and ajust
+		// later when this is not the case.
+		reconnect[reconnectingAlloc.ID] = reconnectingAlloc
+
+		// Find replacement allocations and decide which one to stop. A
+		// reconnecting allocation may have multiple replacements.
+		for _, replacementAlloc := range others {
+			// Skip the reconnecting allocation itself.
+			// Since a replacement may itslef be reconnecting, the set of all
+			// other allocations contains the reconnecting allocations as well.
+			if reconnectingAlloc.ID == replacementAlloc.ID {
 				continue
 			}
 
-			// By default, we prefer stopping the replacement alloc unless
-			// the replacement has a higher metrics score.
-			stopAlloc := untaintedAlloc
-			deleteSet := untainted
-			untaintedMaxScoreMeta := untaintedAlloc.Metrics.MaxNormScore()
-			reconnectingMaxScoreMeta := reconnectingAlloc.Metrics.MaxNormScore()
-
-			if untaintedMaxScoreMeta == nil {
-				a.logger.Error("error computing stop: replacement allocation metrics not available", "alloc_name", untaintedAlloc.Name, "alloc_id", untaintedAlloc.ID)
+			// Skip allocations that don't have the same name.
+			// Replacement allocations have the same name as the original.
+			if reconnectingAlloc.Name != replacementAlloc.Name {
 				continue
 			}
 
-			if reconnectingMaxScoreMeta == nil {
-				a.logger.Error("error computing stop: reconnecting allocation metrics not available", "alloc_name", reconnectingAlloc.Name, "alloc_id", reconnectingAlloc.ID)
+			// Skip allocations that are server terminal.
+			// We don't want to replace a reconnecting allocation with one that
+			// is or will terminate and we don't need to stop them since they
+			// are marked as terminal by the servers.
+			if replacementAlloc.ServerTerminalStatus() {
 				continue
 			}
 
-			statusDescription := allocNotNeeded
-			if untaintedAlloc.Job.Version > reconnectingAlloc.Job.Version ||
-				untaintedAlloc.Job.CreateIndex > reconnectingAlloc.Job.CreateIndex ||
-				untaintedMaxScoreMeta.NormScore > reconnectingMaxScoreMeta.NormScore {
-				stopAlloc = reconnectingAlloc
-				deleteSet = reconnecting
-			} else {
-				statusDescription = allocReconnected
-			}
-
-			stop[stopAlloc.ID] = stopAlloc
-			a.result.stop = append(a.result.stop, allocStopResult{
-				alloc:             stopAlloc,
-				statusDescription: statusDescription,
-			})
-			delete(deleteSet, stopAlloc.ID)
+			// Pick which allocation we want to keep.
+			keepAlloc := pickReconnectingAlloc(reconnectingAlloc, replacementAlloc)
+			if keepAlloc == replacementAlloc {
+				// The replacement allocation is preferred, so stop the one
+				// reconnecting if not stopped yet.
+				if _, ok := stop[reconnectingAlloc.ID]; !ok {
+					stop[reconnectingAlloc.ID] = reconnectingAlloc
+					a.result.stop = append(a.result.stop, allocStopResult{
+						alloc:             reconnectingAlloc,
+						statusDescription: allocNotNeeded,
+					})
+				}
 
-			remove--
-			// if we've removed all we need to, stop iterating and return.
-			if remove == 0 {
-				return remove
+				// Our assumption that all reconnecting allocations will
+				// reconnect was wrong, so remove this one from the reconnect
+				// set.
+				delete(reconnect, reconnectingAlloc.ID)
+			} else {
+				// The reconnecting allocation is preferred, so stop this
+				// replacement.
+				stop[replacementAlloc.ID] = replacementAlloc
+				a.result.stop = append(a.result.stop, allocStopResult{
+					alloc:             replacementAlloc,
+					statusDescription: allocReconnected,
+				})
 			}
 		}
 	}
 
-	return remove
+	return reconnect, stop
+}
+
+// pickReconnectingAlloc returns the allocation to keep between the original
+// one that is reconnecting and one of its replacements.
+//
+// This function is not commutative, meaning that pickReconnectingAlloc(A, B)
+// is not the same as pickReconnectingAlloc(B, A). Preference is given to keep
+// the original allocation when possible.
+func pickReconnectingAlloc(original *structs.Allocation, replacement *structs.Allocation) *structs.Allocation {
+	// Check if the replacement is newer.
+	// Always prefer the replacement if true.
+	replacementIsNewer := replacement.Job.Version > original.Job.Version ||
+		replacement.Job.CreateIndex > original.Job.CreateIndex
+	if replacementIsNewer {
+		return replacement
+	}
+
+	// Check if the replacement has better placement score.
+	// If any of the scores is not available, only pick the replacement if
+	// itself does have scores.
+	originalMaxScoreMeta := original.Metrics.MaxNormScore()
+	replacementMaxScoreMeta := replacement.Metrics.MaxNormScore()
+
+	replacementHasBetterScore := originalMaxScoreMeta == nil && replacementMaxScoreMeta != nil ||
+		(originalMaxScoreMeta != nil && replacementMaxScoreMeta != nil &&
+			replacementMaxScoreMeta.NormScore > originalMaxScoreMeta.NormScore)
+
+	// Check if the replacement has better client status.
+	// Even with a better placement score make sure we don't replace a running
+	// allocation with one that is not.
+	replacementIsRunning := replacement.ClientStatus == structs.AllocClientStatusRunning
+	originalNotRunning := original.ClientStatus != structs.AllocClientStatusRunning
 if !alloc.ServerTerminalStatus() && alloc.ClientStatus == structs.AllocClientStatusFailed { 
 if !alloc.ServerTerminalStatus() && alloc.ClientStatus == structs.AllocClientStatusFailed { 
+
+	if replacementHasBetterScore && (replacementIsRunning || originalNotRunning) {
+		return replacement
+	}
+
+	return original
 }
 
 // computeUpdates determines which allocations for the passed group require