From 281c31e6e576a6648617b4ba8a4d3b3befd6ae2a Mon Sep 17 00:00:00 2001 From: Aaron Lehmann Date: Mon, 13 Mar 2017 13:54:43 -0700 Subject: [PATCH] scheduler: Fix accounting when task ends up in multiple groups Tasks are split between task groups based on common specs. This allows nodes to only be ranked once per group, not once per task. This logic doesn't work correctly because maps are marshalled in a random order. Currently, the same task can end up in multiple groups (say, if it's updated multiple times, and the marshalling ends up being different). Add a check to make sure we don't try to schedule the same task twice within the same batch, which can cause resource accounting to be wrong. Note this doesn't fix the brokenness of task spec deduplication based on marshalling the protobuf. This is a fix for the symptom that can be backported, and I'm going to replace the marshalling stuff in a different PR. Signed-off-by: Aaron Lehmann --- manager/scheduler/scheduler.go | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/manager/scheduler/scheduler.go b/manager/scheduler/scheduler.go index 55420a2d42..e79c0f0532 100644 --- a/manager/scheduler/scheduler.go +++ b/manager/scheduler/scheduler.go @@ -602,6 +602,12 @@ func (s *Scheduler) scheduleNTasksOnNodes(ctx context.Context, n int, taskGroup nodeIter := 0 nodeCount := len(nodes) for taskID, t := range taskGroup { + // Skip tasks which were already scheduled because they ended + // up in two groups at once. + if _, exists := schedulingDecisions[taskID]; exists { + continue + } + node := &nodes[nodeIter%nodeCount] log.G(ctx).WithField("task.id", t.ID).Debugf("assigning to node %s", node.ID)