From d1da2ce020df9f7d0a3fb86594a69f487296e266 Mon Sep 17 00:00:00 2001
From: Michael Schurter <schmichael@hashicorp.com>
Date: Wed, 24 Aug 2016 17:40:11 -0700
Subject: [PATCH 1/4] Don't fail other tasks when retrying artifact get

The artifact fetching may be retried and succeed, so don't set the task
as dead.

Fixes #1558
---
 client/alloc_runner_test.go | 47 ++++++++++++++++++++++++++++++++++---
 client/task_runner.go       |  2 +-
 2 files changed, 45 insertions(+), 4 deletions(-)

diff --git a/client/alloc_runner_test.go b/client/alloc_runner_test.go
index 9bdfcf9e83d..fb0082596b8 100644
--- a/client/alloc_runner_test.go
+++ b/client/alloc_runner_test.go
@@ -25,22 +25,24 @@ func (m *MockAllocStateUpdater) Update(alloc *structs.Allocation) {
 	m.Allocs = append(m.Allocs, alloc)
 }
 
-func testAllocRunner(restarts bool) (*MockAllocStateUpdater, *AllocRunner) {
+func testAllocRunnerFromAlloc(alloc *structs.Allocation, restarts bool) (*MockAllocStateUpdater, *AllocRunner) {
 	logger := testLogger()
 	conf := config.DefaultConfig()
 	conf.StateDir = os.TempDir()
 	conf.AllocDir = os.TempDir()
 	upd := &MockAllocStateUpdater{}
-	alloc := mock.Alloc()
 	if !restarts {
 		*alloc.Job.LookupTaskGroup(alloc.TaskGroup).RestartPolicy = structs.RestartPolicy{Attempts: 0}
 		alloc.Job.Type = structs.JobTypeBatch
 	}
-
 	ar := NewAllocRunner(logger, conf, upd.Update, alloc)
 	return upd, ar
 }
 
+func testAllocRunner(restarts bool) (*MockAllocStateUpdater, *AllocRunner) {
+	return testAllocRunnerFromAlloc(mock.Alloc(), restarts)
+}
+
 func TestAllocRunner_SimpleRun(t *testing.T) {
 	ctestutil.ExecCompatible(t)
 	upd, ar := testAllocRunner(false)
@@ -61,6 +63,45 @@ func TestAllocRunner_SimpleRun(t *testing.T) {
 	})
 }
 
+// TestAllocRuner_RetryArtifact ensures that if one task in a task group is
+// retrying fetching an artifact, other tasks in the the group should be able
+// to proceed. See #1558
+func TestAllocRunner_RetryArtifact(t *testing.T) {
+	ctestutil.ExecCompatible(t)
+
+	alloc := mock.Alloc()
+
+	// Create a copy of the task for testing #1558
+	badtask := alloc.Job.TaskGroups[0].Tasks[0].Copy()
+	badtask.Name = "bad"
+
+	// Add a bad artifact to one of the tasks
+	badtask.Artifacts = []*structs.TaskArtifact{
+		{GetterSource: "http://127.1.1.111:12315/foo/bar/baz"},
+	}
+
+	alloc.Job.TaskGroups[0].Tasks = append(alloc.Job.TaskGroups[0].Tasks, badtask)
+	upd, ar := testAllocRunnerFromAlloc(alloc, true)
+	go ar.Run()
+	defer ar.Destroy()
+
+	testutil.WaitForResult(func() (bool, error) {
+		if upd.Count < 6 {
+			return false, fmt.Errorf("Not enough updates")
+		}
+		last := upd.Allocs[upd.Count-1]
+		if last.TaskStates["web"].State != structs.TaskStatePending {
+			return false, fmt.Errorf("expected web to be pending but found %q", last.TaskStates["web"].State)
+		}
+		if last.TaskStates["bad"].State != structs.TaskStatePending {
+			return false, fmt.Errorf("expected bad to be pending but found %q", last.TaskStates["web"].State)
+		}
+		return true, nil
+	}, func(err error) {
+		t.Fatalf("err: %v", err)
+	})
+}
+
 func TestAllocRunner_TerminalUpdate_Destroy(t *testing.T) {
 	ctestutil.ExecCompatible(t)
 	upd, ar := testAllocRunner(false)
diff --git a/client/task_runner.go b/client/task_runner.go
index 0544a1eae94..76b19f28cfc 100644
--- a/client/task_runner.go
+++ b/client/task_runner.go
@@ -318,7 +318,7 @@ func (r *TaskRunner) run() {
 
 			for _, artifact := range r.task.Artifacts {
 				if err := getter.GetArtifact(r.taskEnv, artifact, taskDir); err != nil {
-					r.setState(structs.TaskStateDead,
+					r.setState(structs.TaskStatePending,
 						structs.NewTaskEvent(structs.TaskArtifactDownloadFailed).SetDownloadError(err))
 					r.restartTracker.SetStartError(dstructs.NewRecoverableError(err, true))
 					goto RESTART

From 2a88e48f07626c46da725ba52cb80c2b543b54a8 Mon Sep 17 00:00:00 2001
From: Michael Schurter <schmichael@hashicorp.com>
Date: Thu, 25 Aug 2016 14:42:50 -0700
Subject: [PATCH 2/4] Ensure web task exited successfully

Web task should run to completion successfully while the `bad` task is
retrying artifact downloads.
---
 client/alloc_runner_test.go | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/client/alloc_runner_test.go b/client/alloc_runner_test.go
index fb0082596b8..d8b48b7054b 100644
--- a/client/alloc_runner_test.go
+++ b/client/alloc_runner_test.go
@@ -65,17 +65,16 @@ func TestAllocRunner_SimpleRun(t *testing.T) {
 
 // TestAllocRuner_RetryArtifact ensures that if one task in a task group is
 // retrying fetching an artifact, other tasks in the the group should be able
-// to proceed. See #1558
+// to proceed.
 func TestAllocRunner_RetryArtifact(t *testing.T) {
 	ctestutil.ExecCompatible(t)
 
 	alloc := mock.Alloc()
+	alloc.Job.Type = structs.JobTypeBatch
 
-	// Create a copy of the task for testing #1558
+	// Create a new task with a bad artifact
 	badtask := alloc.Job.TaskGroups[0].Tasks[0].Copy()
 	badtask.Name = "bad"
-
-	// Add a bad artifact to one of the tasks
 	badtask.Artifacts = []*structs.TaskArtifact{
 		{GetterSource: "http://127.1.1.111:12315/foo/bar/baz"},
 	}
@@ -90,8 +89,12 @@ func TestAllocRunner_RetryArtifact(t *testing.T) {
 			return false, fmt.Errorf("Not enough updates")
 		}
 		last := upd.Allocs[upd.Count-1]
-		if last.TaskStates["web"].State != structs.TaskStatePending {
-			return false, fmt.Errorf("expected web to be pending but found %q", last.TaskStates["web"].State)
+		webstate := last.TaskStates["web"]
+		if webstate.State != structs.TaskStateDead {
+			return false, fmt.Errorf("expected web to be dead but found %q", last.TaskStates["web"].State)
+		}
+		if !webstate.Successful() {
+			return false, fmt.Errorf("expected web to have exited successfully")
 		}
 		if last.TaskStates["bad"].State != structs.TaskStatePending {
 			return false, fmt.Errorf("expected bad to be pending but found %q", last.TaskStates["web"].State)

From bbce6c7ad19b066f5e24ef344262fc9def0badb3 Mon Sep 17 00:00:00 2001
From: Michael Schurter <schmichael@hashicorp.com>
Date: Thu, 25 Aug 2016 16:05:19 -0700
Subject: [PATCH 3/4] Ensure bad task reaches terminal state within test

---
 client/alloc_runner_test.go | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/client/alloc_runner_test.go b/client/alloc_runner_test.go
index d8b48b7054b..c6003a29670 100644
--- a/client/alloc_runner_test.go
+++ b/client/alloc_runner_test.go
@@ -71,6 +71,7 @@ func TestAllocRunner_RetryArtifact(t *testing.T) {
 
 	alloc := mock.Alloc()
 	alloc.Job.Type = structs.JobTypeBatch
+	alloc.Job.TaskGroups[0].RestartPolicy.Attempts = 0
 
 	// Create a new task with a bad artifact
 	badtask := alloc.Job.TaskGroups[0].Tasks[0].Copy()
@@ -89,6 +90,9 @@ func TestAllocRunner_RetryArtifact(t *testing.T) {
 			return false, fmt.Errorf("Not enough updates")
 		}
 		last := upd.Allocs[upd.Count-1]
+
+		// web task should have completed successfully while bad task
+		// retries artififact fetching
 		webstate := last.TaskStates["web"]
 		if webstate.State != structs.TaskStateDead {
 			return false, fmt.Errorf("expected web to be dead but found %q", last.TaskStates["web"].State)
@@ -96,8 +100,14 @@ func TestAllocRunner_RetryArtifact(t *testing.T) {
 		if !webstate.Successful() {
 			return false, fmt.Errorf("expected web to have exited successfully")
 		}
-		if last.TaskStates["bad"].State != structs.TaskStatePending {
-			return false, fmt.Errorf("expected bad to be pending but found %q", last.TaskStates["web"].State)
+
+		// bad task should have failed
+		badstate := last.TaskStates["bad"]
+		if badstate.State != structs.TaskStateDead {
+			return false, fmt.Errorf("expected bad to be dead but found %q", last.TaskStates["web"].State)
+		}
+		if !badstate.Failed() {
+			return false, fmt.Errorf("expected bad to have failed")
 		}
 		return true, nil
 	}, func(err error) {

From 5b0dd7ff2ba5df356970dd95d72850a00b90f69c Mon Sep 17 00:00:00 2001
From: Michael Schurter <schmichael@hashicorp.com>
Date: Thu, 25 Aug 2016 17:25:51 -0700
Subject: [PATCH 4/4] Make sure bad doesn't fail before web runs

---
 client/alloc_runner_test.go | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/client/alloc_runner_test.go b/client/alloc_runner_test.go
index c6003a29670..6ee1f033949 100644
--- a/client/alloc_runner_test.go
+++ b/client/alloc_runner_test.go
@@ -71,7 +71,8 @@ func TestAllocRunner_RetryArtifact(t *testing.T) {
 
 	alloc := mock.Alloc()
 	alloc.Job.Type = structs.JobTypeBatch
-	alloc.Job.TaskGroups[0].RestartPolicy.Attempts = 0
+	alloc.Job.TaskGroups[0].RestartPolicy.Attempts = 1
+	alloc.Job.TaskGroups[0].RestartPolicy.Delay = time.Duration(4*testutil.TestMultiplier()) * time.Second
 
 	// Create a new task with a bad artifact
 	badtask := alloc.Job.TaskGroups[0].Tasks[0].Copy()